Example #1
def predictions(dataframe):
    
    dataframe['log_ENTRIESn_hourly'] = np.log1p(dataframe.ENTRIESn_hourly) # log transformation 
    
    features = dataframe[[]] # option 2: features = dataframe[['meantempi', 'rain']]
    dummy_unit = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    dummy_hour = pd.get_dummies(dataframe['hour'], prefix='hour')
    dummy_day_week = pd.get_dummies(dataframe['day_week'], prefix='day_week')
    features = features.join(dummy_hour).join(dummy_day_week).join(dummy_unit) #join(dummy_rain).
    
#    removing one dummy from each group to avoid dummy variable trap
    features.drop(['unit_R003'], axis = 1, inplace = True) 
    features.drop(['hour_0'], axis = 1, inplace = True)
    features.drop(['day_week_0'], axis = 1, inplace = True)   
    values = dataframe['ENTRIESn_hourly']
    values_log = dataframe['log_ENTRIESn_hourly']
    
#    Perform linear regression
    intercept, params = linear_regression_SGD(features, values_log)    
    log_predictions = intercept + np.dot(features, params)
    log_predictions[log_predictions<0] = 1
    predictions = np.expm1(log_predictions) # inverse logarithmic transformation to produce ENTRIESn_hourly   
    residuals = values - predictions

    return predictions
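If the only goal of the manual column drops above is to avoid the dummy variable trap, pd.get_dummies can drop the reference level itself via drop_first=True. A minimal sketch with made-up UNIT/hour/day_week values:

import pandas as pd

df = pd.DataFrame({'UNIT': ['R003', 'R004', 'R005'],
                   'hour': [0, 4, 8],
                   'day_week': [0, 1, 2]})
dummies = pd.get_dummies(df, columns=['UNIT', 'hour', 'day_week'],
                         prefix=['unit', 'hour', 'day_week'],
                         drop_first=True)  # drops unit_R003, hour_0 and day_week_0
print(dummies.columns.tolist())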
Example #2
 def transform(self, X_df):
     X_encoded = X_df
     
     #uncomment the line below in the submission
     path = os.path.dirname(__file__)
     special_days=pd.read_csv(os.path.join(path, "data_specialdays.csv"), sep = ';')
     X_encoded = X_encoded.merge(special_days, how='left', left_on=['DateOfDeparture'], right_on=['DateOfDeparture'], sort=False)
     
     X_encoded = X_encoded.join(pd.get_dummies(X_encoded['Departure'], prefix='d'))
     X_encoded = X_encoded.join(pd.get_dummies(X_encoded['Arrival'], prefix='a'))
     
     X_encoded['DateOfDeparture'] = pd.to_datetime(X_encoded['DateOfDeparture'])
     X_encoded['year'] = X_encoded['DateOfDeparture'].dt.year
     X_encoded['weekday'] = X_encoded['DateOfDeparture'].dt.weekday
     X_encoded['week'] = X_encoded['DateOfDeparture'].dt.week
     X_encoded = X_encoded.join(pd.get_dummies(X_encoded['year'], prefix='y'))
     X_encoded = X_encoded.join(pd.get_dummies(X_encoded['weekday'], prefix='wd'))
     X_encoded = X_encoded.join(pd.get_dummies(X_encoded['week'], prefix='w'))
     
     X_encoded = X_encoded.drop('Departure', axis=1)
     X_encoded = X_encoded.drop('Arrival', axis=1) 
     X_encoded = X_encoded.drop('weekday', axis=1)
     X_encoded = X_encoded.drop('week', axis=1)
     X_encoded = X_encoded.drop('year', axis=1)
     X_encoded = X_encoded.drop('std_wtd', axis=1)
     X_encoded = X_encoded.drop('WeeksToDeparture', axis=1)        
     X_encoded = X_encoded.drop('DateOfDeparture', axis=1)
     X_array = X_encoded.values
     return X_array
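One caveat with encoding Departure/Arrival this way is that the test set may not contain every category seen in training, so the dummy columns can differ between the two frames. A minimal sketch (illustrative airport codes) of aligning them with reindex:

import pandas as pd

train_dep = pd.Series(['ORD', 'LAX', 'DFW'])
test_dep = pd.Series(['ORD', 'ORD'])            # 'LAX' and 'DFW' absent here

train_d = pd.get_dummies(train_dep, prefix='d')
test_d = pd.get_dummies(test_dep, prefix='d').reindex(columns=train_d.columns,
                                                      fill_value=0)
print(test_d.columns.tolist())                  # same columns as the training frame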
def eval_wrapper(ypred, ytrue):  # ypred: predicted probabilities, ytrue: true labels
    # pad ytrue with one example of each class (0, 1, 2) so that get_dummies
    # always produces all three columns; ypred is expected to already be a
    # probability matrix with rows such as [0.1, 0.2, 0.7]
    #ypred = np.concatenate((ypred, np.array([0, 1, 2])))
    ytrue = np.concatenate((ytrue, np.array([0, 1, 2])))
    print ypred.shape, ytrue.shape
    if len(ytrue.shape) != 2:
        dmmat = pd.get_dummies(np.array(ytrue))
        ytrue = dmmat.values  # [n, 3]
    if len(ypred.shape) != 2:
        dmmat = pd.get_dummies(np.array(ypred))
        ypred = dmmat.values  # [n, 3]
    # drop the three padding rows again
    ytrue = ytrue[:-3, :]

    # alternative metric kept from the original (rounded accuracy / squared error):
    # yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)
    # err = np.sum((y - yhat) * (y - yhat)) / float(y.shape[0]); return err

    # negative log-likelihood: [n,3] -> [n,] -> scalar
    return -np.mean(np.sum(ytrue * np.log(ypred + 0.00001), axis=1))
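For reference, essentially the same negative log-likelihood can be computed with scikit-learn's log_loss, given class probabilities for the labels 0/1/2. A minimal sketch with made-up values:

import numpy as np
from sklearn.metrics import log_loss

ytrue = np.array([0, 2, 1, 1])                   # integer labels
ypred = np.array([[0.8, 0.1, 0.1],               # predicted class probabilities
                  [0.2, 0.2, 0.6],
                  [0.1, 0.7, 0.2],
                  [0.3, 0.5, 0.2]])
print(log_loss(ytrue, ypred, labels=[0, 1, 2]))  # mean cross-entropy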
def getdummy(rawData,categories,stage):
    #make a copy
    data = rawData.copy()
    if stage == "training":
        for category in categories:
            columns = list(data.columns.values)
            #print data[category]
            columnValues = set(data[category])
            #print columnValues
            dummy = pd.get_dummies(data[category],prefix=category)
            #print dummy.head(10)
            if dummy.shape[1] > 1:
                columns.remove(category)
                data = data[columns].join(dummy.iloc[:, 1:])  # drop the first dummy column (.ix was removed from pandas)
            elif dummy.shape[1] == 1:
                columns.remove(category)
                data = data[columns].join(dummy)
    if stage == "testing":
        #print categories
        columns = list(data.columns.values)
        for category in categories:
            columnValues = set(data[category])
            #print columnValues
            dummy = pd.get_dummies(data[category],prefix=category)
            #print dummy.head(10)
            dummyColumns = list(dummy.columns.values)
            for dummyColumn in dummyColumns:
                if dummyColumn in columns:
                    data[dummyColumn] = dummy[dummyColumn]
            columns.remove(category)
        data = data[columns]
            #print dummy.head(10)
    return(data)
def getdummy(rawData,categories,stage,shuffle=False):
    #make a copy
    data = rawData.copy()
    if stage == "training":
        for category in categories:
            columns = list(data.columns.values)
            columnValues = set(data[category])
            dummy = pd.get_dummies(data[category],prefix=category)
            if dummy.shape[1] > 1:
                columns.remove(category)
                data = data[columns].join(dummy.iloc[:, 1:])  # drop the first dummy column (.ix was removed from pandas)
            elif dummy.shape[1] == 1:
                columns.remove(category)
                data = data[columns].join(dummy)
        #shuffle data
        if shuffle == True:
            data = shuffledata(data)
    if stage == "testing":
        columns = list(data.columns.values)
        for category in categories:
            columnValues = set(data[category])
            dummy = pd.get_dummies(data[category],prefix=category)
            dummyColumns = list(dummy.columns.values)
            for dummyColumn in dummyColumns:
                if dummyColumn in columns:
                    data[dummyColumn] = dummy[dummyColumn]
            columns.remove(category)
        data = data[columns]
    return(data)
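Another way to keep the "training" and "testing" dummy columns consistent is to declare the full category set up front with pd.Categorical, so get_dummies always emits the same columns regardless of which values actually appear. A minimal sketch with a hypothetical category list:

import pandas as pd

categories = ['red', 'green', 'blue']            # assumed full category set
test_values = pd.Series(['red', 'red'])          # 'green' and 'blue' unseen here

as_cat = pd.Categorical(test_values, categories=categories)
print(pd.get_dummies(as_cat, prefix='color').columns.tolist())
# ['color_red', 'color_green', 'color_blue']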
Example #6
    def test_basic(self, sparse, dtype):
        s_list = list('abc')
        s_series = Series(s_list)
        s_series_index = Series(s_list, list('ABC'))

        expected = DataFrame({'a': [1, 0, 0],
                              'b': [0, 1, 0],
                              'c': [0, 0, 1]},
                             dtype=self.effective_dtype(dtype))
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        if sparse:
            tm.assert_sp_frame_equal(result,
                                     expected.to_sparse(kind='integer',
                                                        fill_value=0))
        else:
            assert_frame_equal(result, expected)

        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        if sparse:
            expected = expected.to_sparse(kind='integer', fill_value=0)
        assert_frame_equal(result, expected)

        expected.index = list('ABC')
        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
        if sparse:
            expected.to_sparse(kind='integer', fill_value=0)
        assert_frame_equal(result, expected)
Example #7
def create_dummy_variables(df):
    # all the quantitative variables are collected via the describe() function
    quant_variable = df.describe().columns.values
    column = df.columns.values
    df = pd.get_dummies(df)
    return df

    # NOTE: the early return above makes the rest of this function unreachable;
    # it is the older, column-by-column version of the same encoding.
    print("DUMMY:")
    print(dummy_variable.head(5))
    dummy_variable.to_csv("output/train_fixed.csv")  # to_csv() does not accept dummy_na

    for i in column:
        if i not in quant_variable:
            #we are with qualitative variable
            df[i].fillna("no_present", inplace=True)
            dummy_variable = pd.get_dummies(df[i], prefix=i)
            print("DUMMY:")
            print(dummy_variable.head(5))
            print("COLUMN: ", i)
            print(dummy_variable.info()) 
            for dummy in dummy_variable:
                #for value in dummy_variable[dummy]:
                #    print(value) 
                #df.loc[dummy] = dummy_variable[dummy]
                df[dummy] = dummy_variable[dummy]
            #df = df.join(dummy_variable)
            df.drop(i, axis=1, inplace=True)
            #df.reindex(columns = dummy_variable.columns)
            #print(test.info())
    df.to_csv("output/train_fixed2.csv")
    sys.exit()
    return df
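Note that pd.get_dummies applied to a whole DataFrame (as in the early return above) only encodes object/categorical columns and passes numeric columns through, so the per-column loop is mainly useful for custom NaN handling or prefixes. A minimal sketch with an illustrative frame:

import numpy as np
import pandas as pd

df = pd.DataFrame({'price': [100.0, 250.0, np.nan],
                   'zone': ['A', 'B', None]})
print(pd.get_dummies(df, dummy_na=True))   # keeps 'price', adds zone_A / zone_B / zone_nan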
Example #8
    def test_include_na(self, sparse, dtype):
        if sparse:
            pytest.xfail(reason='nan in index is problematic (GH 16894)')

        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=sparse, dtype=dtype)
        exp = DataFrame({'a': [1, 0, 0],
                         'b': [0, 1, 0]},
                        dtype=self.effective_dtype(dtype))
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see #GH8822
        res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame({nan: [0, 0, 1],
                            'a': [1, 0, 0],
                            'b': [0, 1, 0]},
                           dtype=self.effective_dtype(dtype))
        exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True,
                                  sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                                dtype=self.effective_dtype(dtype))
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
Example #9
    def test_dataframe_dummies_with_na(self, df, sparse, dtype):
        df.loc[3, :] = [np.nan, np.nan, np.nan]
        result = get_dummies(df, dummy_na=True,
                             sparse=sparse, dtype=dtype).sort_index(axis=1)

        if sparse:
            arr = SparseArray
            typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype

        expected = DataFrame({'C': [1, 2, 3, np.nan],
                              'A_a': arr([1, 0, 1, 0], dtype=typ),
                              'A_b': arr([0, 1, 0, 0], dtype=typ),
                              'A_nan': arr([0, 0, 0, 1], dtype=typ),
                              'B_b': arr([1, 1, 0, 0], dtype=typ),
                              'B_c': arr([0, 0, 1, 0], dtype=typ),
                              'B_nan': arr([0, 0, 0, 1], dtype=typ)
                              }).sort_index(axis=1)

        assert_frame_equal(result, expected)

        result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
        expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
        assert_frame_equal(result, expected)
def slide_17():
    df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
    print pd.get_dummies(df['key'])

    dummies = pd.get_dummies(df['key'], prefix='key')
    print dummies
    df_with_dummy = df[['data1']].join(dummies)
    print df_with_dummy

    mnames = ['movie_id', 'title', 'genres']
    movies = pd.read_table(MOVIELENSPATH,
                           sep='::',
                           header=None,
                           engine='python',
                           names=mnames)
    print movies[:10]

    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))
    print genres
    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)

    for i, gen in enumerate(movies.genres):
        dummies.ix[i, gen.split('|')] = 1

    movies_windic = movies.join(dummies.add_prefix('Genre_'))
    print movies_windic.ix[0]

    values = np.random.rand(10)
    print values
    bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

    print pd.get_dummies(pd.cut(values, bins))
Example #11
    def __init__(self, x_train, y_train, x_test=None, y_test=None, n_hidden_1=256, n_hidden_2=256, batch_size=100,
                 learning_rate=0.01, training_epochs=30,
                 display_step=-1):
        self.learning_rate = learning_rate
        self.training_epochs = training_epochs
        self.batch_size = batch_size
        if display_step == -1:
            display_step = int(training_epochs / 26)
        self.display_step = display_step

        self.n_hidden_1 = n_hidden_1  # 1st layer num features
        self.n_hidden_2 = n_hidden_2  # 2nd layer num features

        y_train = pd.get_dummies(y_train).as_matrix()
        if y_test is not None:
            y_test = pd.get_dummies(y_test).as_matrix()

        self.n_input = x_train.shape[1]
        self.n_classes = y_train.shape[1]

        self.x = tf.placeholder(tf.float32, [None, self.n_input])
        self.y = tf.placeholder(tf.float32, [None, self.n_classes])

        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
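DataFrame.as_matrix() was removed in pandas 1.0, so the label encoding above is now usually written with to_numpy(). A minimal sketch with illustrative labels:

import pandas as pd

y_train = pd.Series(['cat', 'dog', 'cat'])        # illustrative labels
y_onehot = pd.get_dummies(y_train).to_numpy()     # shape (n_samples, n_classes)
print(y_onehot.shape)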
def prepare_data(original_data):
    
    
    original_data['TotalIncome'] = original_data['ApplicantIncome'] + original_data['CoapplicantIncome']
    original_data['TotalIncome_log'] = np.log(original_data['TotalIncome'])
    original_data['LoanAmount_log'] = np.log(original_data['LoanAmount'])
    
    one_hot_encoding_data = pd.concat([pd.get_dummies(original_data['Gender']),
                                       pd.get_dummies(original_data['Married'], prefix = "Married"),
                                       original_data['Dependents'],
                                       pd.get_dummies(original_data['Education'], prefix = "Education"),
                                       pd.get_dummies(original_data['Self_Employed'], prefix="Self_Employed"),
                                       original_data['TotalIncome_log'],#+original_data['CoapplicantIncome'],
                                       original_data['LoanAmount_log'],
                                       original_data['Loan_Amount_Term'],
                                       original_data['Credit_History'],
                                       pd.get_dummies(original_data['Property_Area'], prefix="Property_Area")                                       
                                       ], axis =1)
    
    #one_hot_encoding_data.drop('Female', 1, inplace=True)
    #one_hot_encoding_data.drop('Married_Yes', 1, inplace=True)
    #one_hot_encoding_data.drop('Education_Not Graduate', 1, inplace=True)    
    #one_hot_encoding_data.drop('Self_Employed_No', 1, inplace=True)
    
    # map the '3+' level of Dependents to a numeric value; a plain boolean-mask
    # assignment would set every column of the matching rows to 5
    one_hot_encoding_data.loc[one_hot_encoding_data.Dependents == '3+', 'Dependents'] = 5
    median_features = one_hot_encoding_data.dropna().median()    
    print median_features
    imputed_features = one_hot_encoding_data.fillna(median_features)
        
    #print imputed_features.head(1)
    #print imputed_features.count()
    
    return imputed_features
def predictions_sm(weather_turnstile):
    #
    # Your implementation goes here. Feel free to write additional
    # helper functions
    #
    values = weather_turnstile['ENTRIESn_hourly']

    # get weekday
    weather_turnstile['DATEn'] = pd.to_datetime(weather_turnstile['DATEn'])
    weather_turnstile['weekend'] = weather_turnstile['DATEn'].dt.dayofweek
    days = {0: 'Mon', 1: 'Tues', 2: 'Weds', 3: 'Thurs', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
    weather_turnstile['weekend'] = weather_turnstile['weekend'].apply(lambda x: days[x])

    features = weather_turnstile[['maxpressurei', 'maxdewpti', 'mindewpti', 'minpressurei', 'meandewpti', 'precipi',
                                  'fog', 'rain', 'meanwindspdi', 'mintempi', 'meantempi', 'maxtempi', 'meanpressurei']]

    # dummy variables for UNIT, weekend and Hour
    dummy_units = pd.get_dummies(weather_turnstile['UNIT'], prefix='unit')
    features = features.join(dummy_units)
    dummy_units = pd.get_dummies(weather_turnstile['weekend'], prefix='day')
    features = features.join(dummy_units)
    dummy_units = pd.get_dummies(weather_turnstile['Hour'], prefix='hour')
    features = features.join(dummy_units)

    features, mu, sigma = normalize_features(features)

    features = sm.add_constant(features)

    # train, fit and predict model
    model = sm.OLS(values, features)
    results = model.fit()
    prediction = model.predict(results.params, features)
    return prediction
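In statsmodels, predictions are more commonly taken from the fitted results object, which already carries the estimated parameters. A minimal sketch on random data:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(50, 3)))
y = X @ np.array([1.0, 2.0, -1.0, 0.5]) + rng.normal(size=50)

results = sm.OLS(y, X).fit()
prediction = results.predict(X)   # same result as model.predict(results.params, X)
print(prediction[:3])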
Example #14
    def test_include_na(self, sparse, dtype):
        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=sparse, dtype=dtype)
        exp = DataFrame({'a': [1, 0, 0],
                         'b': [0, 1, 0]},
                        dtype=self.effective_dtype(dtype))
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see #GH8822
        res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame({nan: [0, 0, 1],
                            'a': [1, 0, 0],
                            'b': [0, 1, 0]},
                           dtype=self.effective_dtype(dtype))
        exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        if sparse:
            exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True,
                                  sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                                dtype=self.effective_dtype(dtype))
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
    def binarize_data(self):
        fn = os.path.join(DATA_DIR, 'all_data_nn.csv')
        df = pd.read_csv(fn)
        df['norm_age'] = (df['age_at_ins'] - min(df['age_at_ins']))/(max(df['age_at_ins']) - min(df['age_at_ins']))
        print df.columns

        # categorical features
        cat_features = ['make', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
                        'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'nvcat']

        # binarize categorical features
        binarized_df1 = pd.get_dummies(df[cat_features])
        binarized_df2 = pd.get_dummies(df['ordcat'])
        binarized_df2.columns = ['ordcat_1', 'ordcat_2', 'ordcat_3', 'ordcat_4', 'ordcat_5', 'ordcat_6', 'ordcat_7']
        binarized_df = pd.concat([df[['rowid', 'var4', 'var5', 'var7', 'nvvar1', 'nvvar2', 'nvvar3', 'nvvar4',
                                      'response', 'ind', 'norm_age']],
                                  binarized_df1,
                                  binarized_df2],
                                 axis=1)

        # remove columns so that there are n-1 features for a categorical variable with n values
        rem_list = ['make_Z', 'cat1_G', 'cat2_C', 'cat3_F', 'cat4_C', 'cat5_C', 'cat6_F', 'cat7_D',
                    'cat8_C', 'cat9_B', 'cat10_C', 'cat11_F', 'cat12_F', 'nvcat_O', 'ordcat_7']
        binarized_df = binarized_df.drop(rem_list, axis=1)
        binarized_df.to_csv('all_data_nn_binarized.csv', index=False, index_label=False)
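Passing columns= together with drop_first=True to pd.get_dummies binarizes and drops one reference level per variable in a single call, which replaces the manual rem_list above (at the cost of not choosing which level is the reference). A minimal sketch with illustrative columns:

import pandas as pd

df = pd.DataFrame({'make': ['A', 'B', 'Z'],
                   'cat1': ['A', 'G', 'A'],
                   'var4': [1.0, 2.0, 3.0]})
encoded = pd.get_dummies(df, columns=['make', 'cat1'], drop_first=True)
print(encoded.columns.tolist())   # 'var4' plus n-1 dummies per categorical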
 def transform(self, X_df):
     X_encoded = X_df
     
     #uncomment the line below in the submission
     path = os.path.dirname(__file__)
     X_encoded = X_df
     X_encoded = X_encoded.join(pd.get_dummies(X_encoded['Departure'], prefix='d'))
     X_encoded = X_encoded.join(pd.get_dummies(X_encoded['Arrival'], prefix='a'))
     X_encoded = X_encoded.drop('Departure', axis=1)
     X_encoded = X_encoded.drop('Arrival', axis=1)
 
     #data_holidays = pd.read_csv("data_holidays_2.csv")
     data_holidays = pd.read_csv(os.path.join(path, "data_holidays_2.csv"))
     X_holidays = data_holidays[['DateOfDeparture','Xmas','Xmas-1','NYD','NYD-1','Ind','Thg','Thg+1','Lab','Mem']]     
     X_encoded = X_encoded.merge(X_holidays, how='left', left_on=['DateOfDeparture'], right_on=['DateOfDeparture'], sort=False)
     
     X_encoded['DateOfDeparture'] = pd.to_datetime(X_encoded['DateOfDeparture'])
     X_encoded['year'] = X_encoded['DateOfDeparture'].dt.year
     X_encoded['weekday'] = X_encoded['DateOfDeparture'].dt.weekday
     X_encoded['week'] = X_encoded['DateOfDeparture'].dt.week
     X_encoded = X_encoded.join(pd.get_dummies(X_encoded['year'], prefix='y'))
     X_encoded = X_encoded.join(pd.get_dummies(X_encoded['weekday'], prefix='wd'))
     X_encoded = X_encoded.join(pd.get_dummies(X_encoded['week'], prefix='w'))
     X_encoded = X_encoded.drop('weekday', axis=1)
     X_encoded = X_encoded.drop('week', axis=1)
     X_encoded = X_encoded.drop('year', axis=1)
     X_encoded = X_encoded.drop('std_wtd', axis=1)
     X_encoded = X_encoded.drop('WeeksToDeparture', axis=1)        
     X_encoded = X_encoded.drop('DateOfDeparture', axis=1)     
     X_array = X_encoded.values
     return X_array
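Series.dt.week, used above for the 'week' feature, is deprecated and removed in recent pandas; the ISO week is now taken from dt.isocalendar(). A minimal sketch:

import pandas as pd

dates = pd.to_datetime(pd.Series(['2013-01-01', '2013-07-04']))
week = dates.dt.isocalendar().week.astype(int)   # replacement for dates.dt.week
print(week.tolist())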
def dataPreprocessing(filename):

    data_train=pd.read_csv(filename)
    data_train,rfr = set_missing_ages(data_train)
    data_train = set_Cabin_type(data_train)

# one-hot encode the categorical columns
    dummies_Cabin=pd.get_dummies(data_train['Cabin'],prefix='Cabin')
    dummies_Embarked=pd.get_dummies(data_train['Embarked'],prefix='Embarked')
    dummies_Pclass=pd.get_dummies(data_train['Pclass'],prefix='Pclass')
    dummies_Sex=pd.get_dummies(data_train['Sex'],prefix='Sex')

# drop the original Cabin, Embarked, Pclass and Sex columns
    data_train.drop(['Cabin','Embarked','Pclass','Sex'],axis=1,inplace=True)

# build a new DataFrame with the dummy columns
    df=pd.concat([data_train,dummies_Cabin,dummies_Embarked,dummies_Pclass,dummies_Sex],axis=1)

# standardize Age and Fare using sklearn's preprocessing module
# (instantiate a StandardScaler; note that current scikit-learn expects 2-D input such as df[['Age']])
    ps=preprocessing.StandardScaler()
    Age_scale_param=ps.fit(df['Age'])
    df['Age_scaled']=ps.fit_transform(df['Age'],Age_scale_param)
    Fare_scale_param=ps.fit(df['Fare'])
    df['Fare_scaled']=ps.fit_transform(df['Fare'],Fare_scale_param)

    return df,rfr
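scikit-learn scalers expect 2-D input, so scaling a single column is normally written with a [['Age']] selection rather than a Series. A minimal sketch:

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({'Age': [22.0, 38.0, 26.0, 35.0]})
scaler = StandardScaler()
df['Age_scaled'] = scaler.fit_transform(df[['Age']]).ravel()   # (n, 1) array back into a column
print(df)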
Example #18
def get_votes_data(votes_df):
    """creates dummies, converts dates, and gets counts for votes"""
    votes_df["date"] = pd.to_datetime(votes_df.date)
    votes_df["num_yes"] = votes_df.votes.map(lambda x: len(x.get("Yea", x.get("Aye", []))))
    votes_df["num_no"] = votes_df.votes.map(lambda x: len(x.get("No", x.get("Nay", []))))
    votes_df["num_not_voting"] = votes_df.votes.map(lambda x: len(x.get("Not Voting", [])))
    votes_df["num_present"] = votes_df.votes.map(lambda x: len(x.get("Present", [])))
    votes_df["percent_yes_D"] = votes_df.votes.map(
        lambda x: get_precent_party(x.get("Yea", x.get("Aye", [])))["countD"]
    )
    votes_df["percent_no_D"] = votes_df.votes.map(lambda x: get_precent_party(x.get("No", x.get("Nay", [])))["countD"])
    votes_df["percent_yes_R"] = votes_df.votes.map(
        lambda x: get_precent_party(x.get("Yea", x.get("Aye", [])))["countR"]
    )
    votes_df["percent_no_R"] = votes_df.votes.map(lambda x: get_precent_party(x.get("No", x.get("Nay", [])))["countR"])
    votes_df["percent_not_voting_D"] = votes_df.votes.map(
        lambda x: get_precent_party(x.get("Not Voting", []))["countD"]
    )
    votes_df["percent_not_voting_R"] = votes_df.votes.map(
        lambda x: get_precent_party(x.get("Not Voting", []))["countR"]
    )
    votes_df["percent_present_D"] = votes_df.votes.map(lambda x: get_precent_party(x.get("Present", []))["countD"])
    votes_df["percent_present_R"] = votes_df.votes.map(lambda x: get_precent_party(x.get("Present", []))["countR"])
    votes_df["is_amendment"] = votes_df.amendment.notnull()
    votes_df = pd.concat([votes_df, pd.get_dummies(votes_df.category)], axis=1)
    votes_df.drop("unknown", axis=1, inplace=True)
    votes_df = pd.concat([votes_df, pd.get_dummies(votes_df.requires)], axis=1)
    votes_df.drop("3/5", axis=1, inplace=True)
    votes_df = pd.concat([votes_df, pd.get_dummies(votes_df.session)], axis=1)
    votes_df.drop("2002", axis=1, inplace=True)
    return votes_df
Example #19
def model1(title):
    df = pd.read_csv('./data/train.csv')
    df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    df['Gender']= df['Sex'].map({'female':0, 'male': 1}).astype(int)
    age_mean = df['Age'].mean()
    mode_embarked = mode(df['Embarked'])[0][0]
    df['Embarked'] = df['Embarked'].fillna(mode_embarked)
    df['Age'] = df['Age'].fillna(age_mean)
    df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)
    df = df.drop(['Sex', 'Embarked'], axis=1)
    cols = df.columns.tolist()
    cols = [cols[1]] + cols[0:1] + cols[2:]
    df = df[cols]
    train_data = df.values
    #rf(train_data[0:, 2:], train_data[0:,0], train_data[0:, 2:], train_data[0:,0])
    #model = RandomForestClassifier(n_estimators=100)
    #model = model.fit(train_data[0:, 2:], train_data[0:,0])

    df_test = pd.read_csv('./data/test.csv')
    df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    df_test['Gender']= df_test['Sex'].map({'female':0, 'male': 1}).astype(int)
    age_mean = df_test['Age'].mean()
    df_test['Age'] = df_test['Age'].fillna(age_mean)
    fare_means = df.pivot_table('Fare', index='Pclass', aggfunc='mean')
    df_test['Fare'] = df_test[['Fare', 'Pclass']].apply(lambda x: fare_means.loc[x['Pclass'], 'Fare']
        if pd.isnull(x['Fare']) else x['Fare'], axis=1)
    df_test = pd.concat([df_test, pd.get_dummies(df_test['Embarked'], prefix='Embarked')], axis=1)
    df_test = df_test.drop(['Sex', 'Embarked'], axis=1)
    test_data = df_test.values
    if title.rf:
        rf(train_data[0:, 2:], train_data[0:,0], train_data[0:, 2:], train_data[0:,0])
    if title.mlp:
        nn(train_data[0:, 2:], train_data[0:,0], train_data[0:, 2:], train_data[0:,0])
Example #20
def preprocess_data(data):

    data['Title'] = data['Name'].apply(get_title)

    
    data=data.drop('Name', axis=1)

    data=data.drop('Cabin', axis=1)
    data=data.drop('Ticket', axis=1)

    # data['Age'].fillna(data['Age'].mean(), inplace=True)
    process_age(data)
    data['Age'].fillna(data['Age'].mean(), inplace=True)
    print (data.info())
    data['Fare'].fillna(data['Fare'].mean(), inplace=True)
    data['Embarked'].fillna('S', inplace=True)

    gender_dummy=pd.get_dummies(data['Sex'])
    data=pd.concat([data, gender_dummy], axis=1)
    data=data.drop('Sex', axis=1)
    data=data.drop('Title', axis=1)

    gender_dummy=pd.get_dummies(data['Embarked'])
    data=pd.concat([data, gender_dummy], axis=1)
    data=data.drop('Embarked', axis=1)

    return data
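Giving each dummy block a prefix avoids ambiguous or colliding column names when several categoricals (here Sex and Embarked) are concatenated into one frame. A minimal sketch:

import pandas as pd

data = pd.DataFrame({'Sex': ['male', 'female'], 'Embarked': ['S', 'C']})
data = pd.get_dummies(data, columns=['Sex', 'Embarked'], prefix=['Sex', 'Embarked'])
print(data.columns.tolist())   # ['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_S']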
    def transform(self, X_df):
        X_encoded = X_df

        # uncomment the line below in the submission
        # path = os.path.dirname(__file__)
        X_encoded = X_df
        X_encoded = X_encoded.join(pd.get_dummies(X_encoded["Departure"], prefix="d"))
        X_encoded = X_encoded.join(pd.get_dummies(X_encoded["Arrival"], prefix="a"))
        X_encoded = X_encoded.drop("Departure", axis=1)
        X_encoded = X_encoded.drop("Arrival", axis=1)

        #   data_holidays = pd.read_csv(os.path.join(path, "data_holidays.csv"))
        #   X_holidays = data_holidays[['DateOfDeparture','Xmas','Xmas-1','NYD','NYD-1','Ind','Thg','Thg+1']]
        #   X_encoded = X_encoded.set_index(['DateOfDeparture'])
        #   X_holidays = X_holidays.set_index(['DateOfDeparture'])
        #   X_encoded = X_encoded.join(X_holidays).reset_index()

        X_encoded["DateOfDeparture"] = pd.to_datetime(X_encoded["DateOfDeparture"])
        X_encoded["year"] = X_encoded["DateOfDeparture"].dt.year
        X_encoded["weekday"] = X_encoded["DateOfDeparture"].dt.weekday
        X_encoded["week"] = X_encoded["DateOfDeparture"].dt.week
        X_encoded = X_encoded.join(pd.get_dummies(X_encoded["year"], prefix="y"))
        X_encoded = X_encoded.join(pd.get_dummies(X_encoded["weekday"], prefix="wd"))
        X_encoded = X_encoded.join(pd.get_dummies(X_encoded["week"], prefix="w"))
        X_encoded = X_encoded.drop("weekday", axis=1)
        X_encoded = X_encoded.drop("week", axis=1)
        X_encoded = X_encoded.drop("year", axis=1)
        X_encoded = X_encoded.drop("std_wtd", axis=1)
        X_encoded = X_encoded.drop("WeeksToDeparture", axis=1)
        X_encoded = X_encoded.drop("DateOfDeparture", axis=1)
        X_array = X_encoded.values
        return X_array
Example #22
def gridtrainfraction(trainiter, rfparams):
    ''' read in data once for grid search, clear, then again for model fit'''
    train = fractionate(trainiter, fraction=0.002)
    clf = RandomForestClassifier(**rfparams)
    grid = GridSearchCV(clf, param_grid=gridparams, scoring='log_loss', n_jobs=1)  # 'neg_log_loss' in current scikit-learn
    X_train = train.drop('hotel_cluster', axis=1)
    X = sparsify(pd.get_dummies(X_train.astype(str)))
    y = train['hotel_cluster']
    grid.fit(X,y)
    
    print(grid.best_params_)
    print(grid.grid_scores_)  # grid_scores_ was replaced by cv_results_ in current scikit-learn
    
    train = None
    X_train = None
    X = None
    y = None
    clf = None
    
    train = fractionate(trainiter, fraction=0.01)
    X_train = train.drop('hotel_cluster', axis=1)
    X = sparsify(pd.get_dummies(X_train.astype(str)))
    y = train['hotel_cluster']
    bestparams = grid.best_params_
    clf = RandomForestClassifier(**rfparams)
    clf.set_params(**bestparams)
    clf.fit(X,y)
    return clf
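pd.get_dummies can also emit sparse columns directly via sparse=True, which is an alternative to densifying first and then converting with a helper such as sparsify (assumed above to build a scipy sparse matrix). A minimal sketch:

import pandas as pd

X_train = pd.DataFrame({'site': ['a', 'b', 'a', 'c'],
                        'device': ['m', 'm', 'd', 'd']})
X_sparse = pd.get_dummies(X_train, sparse=True)   # columns use a SparseDtype
print(X_sparse.dtypes)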
Example #23
def predictions(dataframe):
    features = dataframe[['meantempi']]
    
    dummy_unit = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    dummy_hour = pd.get_dummies(dataframe['Hour'], prefix='hour')
    date_fn_input = dataframe['DATEn'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    day_of_wk = date_fn_input.apply(lambda x: datetime.strftime(x, '%w')) # isolate day of the week for dummy variable
    dummy_day_of_wk = pd.get_dummies(day_of_wk, prefix='day_of_wk')
    
    features = features.join(dummy_unit).join(dummy_hour).join(dummy_day_of_wk)
    values = dataframe['ENTRIESn_hourly']

    features_array = features.values 
    values_array = values.values
    print pd.DataFrame(features_array).head()
    
    means, std_devs, normalized_features_array = normalize_features(features_array)

    # Perform stochastic gradient descent on the normalized features
    norm_intercept, norm_params = linear_regression_SGD(normalized_features_array, values_array)
    predictions_normalized = norm_intercept + np.dot(normalized_features_array, norm_params)
    # print pd.DataFrame(predictions_normalized).head()

    # Recover the parameters on the original feature scale; predicting from the
    # unnormalized features with these parameters is equivalent to the line above
    intercept, params = recover_params(means, std_devs, norm_intercept, norm_params)
    predictions_SGD = intercept + np.dot(features_array, params)

    return predictions_SGD
Example #24
    def _addFeatures(self, df):
        
        #Add additional features to train/test dataframe
        if len(df)>0:

            df = df.reset_index()

            df['hour'] = df.apply(lambda row: row['pick_date'].hour, axis=1)
            df['min']  = df.apply(lambda row: row['pick_date'].hour * 60 + row['pick_date'].minute, axis=1)
            df['weekend'] = df.apply(lambda row: f_is_weekend(row['pick_date']), axis=1) #binary feature
            tod = df.apply(lambda row: f_tod(row['pick_date']), axis=1) #morning, midday, afternoon, evening, night
            weekday = df.apply(lambda row: row['pick_date'].weekday(), axis=1)  #0-6

            #dummify categorical features (drop one dummy and append the rest to the resulting dataframe);
            #reference values are appended to the interim data so that all dummy columns are present in the
            #output even if the dataset does not contain every value of the tod/weekday column
            ref_vals = ['morning', 'midday', 'afternoon', 'evening', 'night']
            tod = pd.Series(list(itertools.chain(tod, ref_vals))) 
            tod = pd.get_dummies(tod, prefix = 'tod')
            df = df.join(tod[['tod_' + i for i in ref_vals[1:]]].iloc[:len(df)])  # .ix was removed from pandas

            ref_vals = range(0,7)
            weekday = pd.Series(list(itertools.chain(weekday, ref_vals)))
            weekday = pd.get_dummies(weekday, prefix = 'weekday')
            df = df.join(weekday[['weekday_' + str(i) for i in ref_vals[1:]]].iloc[:len(df)])

        return df
def Onehot_Encoding_DD(New_DS, Train_DS, y):

    #cos_dist_T = Get_similarity_matrix(Train_DS,y)
    cos_dist_T = Get_similarity_matrix_DD2(New_DS)

    #one hot encoding for DepartmentDescription
    print("one hot encoding sales - DepartmentDescription at Time: %s" %(tm.strftime("%H:%M:%S")))

    dummies = pd.get_dummies(New_DS['DepartmentDescription'])
    DeptDesc_cols = [ 'DD'+"_buy1_"+str(s) for s in list(dummies.columns)]

    sim_dd_buy = cos_dist_T
    sim_dd_buy.columns = DeptDesc_cols
    sim_dd_buy = sim_dd_buy.reset_index()

    cols = ['VisitNumber','ScanCount','DepartmentDescription']
    New_DS = New_DS[cols].merge(sim_dd_buy,left_on='DepartmentDescription',right_on='index',how='left')
    New_DS = New_DS.drop(['index'], axis = 1)

    #get "buying" qty for  DepartmentDescription
    Temp_Scan = pd.DataFrame()
    Temp_Scan['ScanCount'] = New_DS ['ScanCount']
    Temp_Scan['ScanCount'] = np.where(New_DS ['ScanCount']>= 0,New_DS ['ScanCount'],0).astype(int)

    for i in range(len(DeptDesc_cols)):
        New_DS[DeptDesc_cols[i]] = New_DS[DeptDesc_cols[i]] * Temp_Scan ['ScanCount']

    del sim_dd_buy
    ##----------------------------------------------------------------------------------------------------------------##

    print("one hot encoding return - DepartmentDescription at Time: %s" %(tm.strftime("%H:%M:%S")))

    #one hot encoding for DepartmentDescription - Return
    dummies = pd.get_dummies(New_DS['DepartmentDescription'])
    DeptDesc_cols = [ 'DD'+"_ret1_"+str(s) for s in list(dummies.columns)]

    sim_dd_ret = cos_dist_T
    sim_dd_ret.columns = DeptDesc_cols
    sim_dd_ret = sim_dd_ret.reset_index()

    New_DS = New_DS.merge(sim_dd_ret,left_on='DepartmentDescription',right_on='index',how='left')
    New_DS = New_DS.drop(['index'], axis = 1)

    #get "return" qty for  DepartmentDescription
    Temp_Scan['ScanCount'] = New_DS ['ScanCount']
    Temp_Scan['ScanCount'] = np.where(New_DS ['ScanCount'] < 0,New_DS ['ScanCount']*-1,0).astype(int)

    for i in range(len(DeptDesc_cols)):
        New_DS[DeptDesc_cols[i]] = New_DS[DeptDesc_cols[i]] * Temp_Scan ['ScanCount']

    del sim_dd_ret
    ##----------------------------------------------------------------------------------------------------------------##
    New_DS = New_DS.drop(['ScanCount','DepartmentDescription'], axis = 1)
    New_DS = New_DS.groupby('VisitNumber').sum().reset_index()

    print(np.shape(New_DS))

    #pd.DataFrame(New_DS).to_csv(file_path+'New_DS.csv')

    return New_DS
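The multiply-by-ScanCount-and-group pattern above can be written compactly with DataFrame.mul and groupby().sum(). A minimal sketch on toy data:

import pandas as pd

df = pd.DataFrame({'VisitNumber': [1, 1, 2],
                   'DepartmentDescription': ['DAIRY', 'PRODUCE', 'DAIRY'],
                   'ScanCount': [2, 1, 3]})
dummies = pd.get_dummies(df['DepartmentDescription'], prefix='DD')
counts = dummies.mul(df['ScanCount'], axis=0)          # quantity-weighted dummies
per_visit = pd.concat([df['VisitNumber'], counts], axis=1).groupby('VisitNumber').sum()
print(per_visit)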
def transform_big(data):

    data = data.copy()
    shot_type = pd.get_dummies(data["Shot Type"].apply(shot))
    data["Shot Dist."] = data["Shot Dist."].apply(lambda x : x.replace("ft.", ""))
    data["Shot Dist."] = data["Shot Dist."].apply(lambda x : 0 if x== "" else float(x))
    
    # shot_clock = data["Shot Clock"].apply(lambda x: 0 if x == "" else float(x))
    # touch_time = data["Touch Time"].apply(lambda x: float(x))
    # drib = data["Drib."].apply(lambda x: int(x))
    data["Def Dist."] = data["Def Dist."].apply(lambda x: float(x))

    # def_dist_c = pd.get_dummies(data["Def Dist."].apply(def_dist))
    
    player_c = pd.get_dummies(data["Player"])

    shot_dist_c = pd.get_dummies(data["Shot Dist."].apply(shot_dist))
    
    con = [player_c, shot_type , data["Def Dist."],
            shot_dist_c, data["Shot Dist."],(data["Made?"]=="Yes").astype(int)]
    
#     con = [player_c, shot_type, shot_clock, touch_time, drib, 
#            shot_dist_c, data["Shot Dist."],(data["Made?"]=="Yes").astype(int)]
        
    new_shot_chart = pd.concat(con , axis=1)

    pred = player_c.columns[:len(player_c)].tolist()+ ['Shot Dist.', 'Def Dist.', 'else', 'jump', 'layup', 'Made?']

    return new_shot_chart[pred]
def trans2vect(data):
    item_vec = data.reindex(columns=orin_name)
    # dummy
    capsule = pd.get_dummies(data.CAPSULE_TEXT, prefix='cap_')
    genre = pd.get_dummies(data.GENRE_NAME, prefix='gen_')
    large_area = pd.get_dummies(data.large_area_name, prefix='larg_area_')
    ken_name = pd.get_dummies(data.ken_name, prefix='ken_')
    small_name = pd.get_dummies(data.small_area_name, prefix='small_area_')
    # time
    dispfrom = pd.to_datetime(data.DISPFROM)
    item_vec['dispfrom'] = [x.dayofyear for x in dispfrom]
    dispend = pd.to_datetime(data.DISPEND)
    item_vec['dispend'] = [x.dayofyear for x in dispend]
    validfrom = pd.to_datetime(data.VALIDFROM)
    item_vec['validfrom'] = [x.dayofyear for x in validfrom]
    validend = pd.to_datetime(data.VALIDEND)
    item_vec['validend'] = [x.dayofyear for x in validend]
    # join
    item_vec = item_vec.join([capsule, genre, large_area, ken_name, small_name])
    item_vec.index = data.COUPON_ID_hash
    item_vec = item_vec.fillna(0)
    # feature engineering
    item_vec.DISCOUNT_PRICE = 1 / np.log10(item_vec.DISCOUNT_PRICE)
    item_vec.CATALOG_PRICE = 1 / np.log10(item_vec.CATALOG_PRICE)
    item_vec.PRICE_RATE = (item_vec.PRICE_RATE ** 2) / (100 * 100)
    scale_name = [u'DISPPERIOD', u'VALIDPERIOD',u'dispfrom', u'dispend', u'validfrom', u'validend']
    for i in scale_name:
        item_vec[i] = scale(item_vec[i])
    return item_vec
Example #28
def get_data_frame_with_dummies(users):
    users_ref = users.copy()
    base_dummies = None
    categories = {'gender': ['male', 'female'], 'education': ['overGraduate', 'university', 'underHigh'],
                  'income': ['100', '200', '300', '400', '500', '1200more'], 'job': ['officer', 'student', 'etc'],
                  'marriage': ['married', 'single'], 'religion': ['buddhist', 'none', 'christian', 'romanCatholicism']}
    age_bins = [10, 20, 30, 40, 50, 60, 70]
    numChild_bins = [0, 1, 10]
    for label_type in users_ref.columns:
        temp_dummies = None
        if label_type == 'age':
            temp_dummies = pd.get_dummies(pd.cut(users_ref[label_type], age_bins, right=False), prefix=label_type)
        elif label_type == 'numberOfChildren':
            temp_dummies = pd.get_dummies(pd.cut(users_ref[label_type], numChild_bins, right=False), prefix=label_type)
        elif label_type == 'residence':
            continue
        else:
            users_ref[label_type + "_cat"] = pd.Categorical(users_ref[label_type],
                                                            categories=categories.get(label_type))
            temp_dummies = pd.get_dummies(users_ref[label_type + "_cat"], prefix=label_type)

        if base_dummies is None:
            base_dummies = temp_dummies
        else:
            base_dummies = pd.concat([base_dummies, temp_dummies], axis=1)
    label_nums = base_dummies.sum()
    label_rates = label_nums / float(len(users_ref))
    return base_dummies, label_nums, label_rates
Example #29
def procc_testset(clf, age_scale_param, fare_scale_param):
    import sklearn.preprocessing as preprocessing
    import numpy as np
    scaler = preprocessing.StandardScaler()
    data_test = pd.read_csv("test.csv")
    data_test.loc[ (data_test.Fare.isnull()), 'Fare' ] = 0
    # apply the same feature transformations to test_data as were used on train_data
    # first, fill in the missing ages with the same RandomForestRegressor
    # (rfr: the model fitted on the training data)
    tmp_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    null_age = tmp_df[data_test.Age.isnull()].as_matrix()
    # predict the missing ages from the remaining feature columns and fill them in
    X = null_age[:, 1:]
    predictedAges = rfr.predict(X)
    data_test.loc[ (data_test.Age.isnull()), 'Age' ] = predictedAges

    data_test = set_Cabin_type(data_test)
    dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix= 'Cabin')
    dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix= 'Embarked')
    dummies_Sex = pd.get_dummies(data_test['Sex'], prefix= 'Sex')
    dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix= 'Pclass')

    df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
    df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
    # note: fit_transform ignores the *_scale_param arguments (they are treated as y),
    # and current scikit-learn expects 2-D input such as df_test[['Age']]
    df_test['Age_scaled'] = scaler.fit_transform(df_test['Age'], age_scale_param)
    df_test['Fare_scaled'] = scaler.fit_transform(df_test['Fare'], fare_scale_param)
    df_test.head(8)
    ############
    test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    predictions = clf.predict(test)
    result = pd.DataFrame({'PassengerId':data_test['PassengerId'].as_matrix(), 'Survived': predictions.astype(np.int32)})
    result.to_csv("logistic_regression_predictions.csv", index=False)
Example #30
# use .loc to avoid chained-indexing assignment, which may silently fail to update
data.loc[data['thalassemia'] == 0, 'thalassemia'] = 'reversable defect'

x = data.iloc[:, 0:13].values
x = pd.DataFrame(x)
y = data.iloc[:, 13].values

x.columns = [
    'age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol',
    'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
    'exercise_induced_angina', 'st_depression', 'st_slope',
    'num_major_vessels', 'thalassemia'
]
x = pd.get_dummies(x,
                   columns=[
                       'sex', 'chest_pain_type', 'fasting_blood_sugar',
                       'rest_ecg', 'exercise_induced_angina', 'st_slope',
                       'thalassemia'
                   ],
                   drop_first=True)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

#Feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
Example #31
    model.save('best_lstm_model.h5') 
    
    del history
    del model
    gc.collect()


"""### Evaluation"""

predY = np.average(submission_predictions, axis = 0, weights = [2**i for i in range(len(submission_predictions))])

# plot precision-recall-curve
precision = dict()
recall = dict()

y_test_dummies = pd.get_dummies(testY, drop_first=False).values
for i in range(3):
    precision[i], recall[i], _ = precision_recall_curve(y_test_dummies[:, i], predY[:, i])
    plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))
    
plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve")
plt.show()
plt.savefig('prc.png')

# plot ROC
import seaborn as sns
class_to_label_map = ['normal', 'covid', 'pneumonia']
        else:
            return 24

    else:
        return Age


train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis=1)
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')

train.drop('Cabin', axis=1, inplace=True)
train.head()
train.dropna(inplace=True)
train.info()
sex = pd.get_dummies(train['Sex'], drop_first=True)
embark = pd.get_dummies(train['Embarked'], drop_first=True)
train.drop(['Sex', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)
train = pd.concat([train, sex, embark], axis=1)
train.head()

# building logistic regression model

# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train.drop('Survived',
                                                               axis=1),
                                                    train['Survived'],
                                                    test_size=0.30,
                                                    random_state=101)
full.loc[full['build_year']==1,'build_year']=np.nan
full.loc[full['build_year']==20,'build_year']=2000
full.loc[full['build_year']==215,'build_year']=2015
full.loc[full['build_year']==3,'build_year']=np.nan
full.loc[full['build_year']==2,'build_year']=np.nan
full.loc[full['build_year']==71,'build_year']=np.nan
full.loc[full['build_year']==4965,'build_year']=np.nan
#re-partition sub_area
# full.loc[full['sub_area']=='']

full.drop(["id", "timestamp", "price_doc"], axis=1,inplace=True)



#289 columns before get_dummies, 451 columns after
full=pd.get_dummies(full,columns=col_object)



#model hyperparameter tuning
def get_model(estimator, parameters, X_train, y_train, scoring):
    model = GridSearchCV(estimator, param_grid=parameters, scoring=scoring)
    model.fit(X_train, y_train)
    return model.best_estimator_

#
# X=full[full.floor.notnull()].drop('floor',axis=1)
# y=full[full.floor.notnull()].floor
#
# X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=2017)
# XGB = xgb.XGBRegressor(max_depth=4, seed= 2017)
Example #34
y = dataset[:, -1]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)
#print(pd.DataFrame(y_train).hist(bins=4))
pd.Series(y_train).value_counts(bins=4)
pd.Series(y_test).value_counts(bins=4)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#one-hot encoded output classes
y_train = pd.get_dummies(y_train).values
y_test = pd.get_dummies(y_test).values
#test1=test.values
#X_test=test1[:,0:-1]
#y_test=test1[:,-1]

# Designing of the model
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import Input
from keras.models import Model
from keras.optimizers import Adadelta, SGD, Adam, RMSprop

input_img = Input(shape=(36, ))
Example #35
def prepare_data():
    # Copied wti200's kernel: from https://www.kaggle.com/wti200/deep-neural-network-for-starters-r
    excluded = get_excluded()
    df_train = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
    df_test = pd.read_csv("../input/test.csv", parse_dates=['timestamp'])

    #-------------------------------------
    # Note that the following is essential for good performance:
    # these pkl files can be produced with the two kernels below, using 5-fold
    # non-shuffled training, as is usually done in stacking
    #-------------------------------------
    # https://www.kaggle.com/schoolpal/lgbm-lb-0-3093-0-3094
    # https://www.kaggle.com/schoolpal/modifications-to-reynaldo-s-script

    # (xgb_train,xgb_test)=pickle.load(open('xgb_predicted.pkl'))
    # (xgb_train_log,xgb_test_log)=pickle.load(open('xgb_predicted_log.pkl'))
    # (lgb_train,lgb_test)=pickle.load(open('lgb_predicted.pkl'))

    # df_train['xgb_score']=xgb_train
    # df_train['log_xgb_score']=np.log(xgb_train)
    # df_train['lgb_score']=lgb_train
    # df_train['lgb_score_log']=np.log(lgb_train)
    ## df_train['log_xgb_score']=xgb_train_log
    ## df_train['log_xgb_score_log']=np.log(xgb_train_log)
    # df_test['xgb_score']=xgb_test
    # df_test['log_xgb_score']=np.log(xgb_test)
    # df_test['lgb_score']=lgb_test
    # df_test['lgb_score_log']=np.log(lgb_test)
    ## df_test['log_xgb_score']=xgb_test_log
    ## df_test['log_xgb_score_log']=np.log(xgb_test_log)

    # Magic number from Andy's script (Louis?)
    df_train['price_doc'] *= 0.969

    full_sq = df_train.full_sq.copy()
    full_sq[full_sq < 5] = np.NaN

    price_sq = df_train.price_doc / full_sq
    #Remove the extreme prices; taken from someone else's kernel
    df_train = df_train[(price_sq < 600000) & (price_sq > 10000)]
    price_sq = price_sq[(price_sq < 600000) & (price_sq > 10000)]

    y_train = df_train.price_doc
    df_train.drop(['price_doc'], inplace=True, axis=1)
    num_train = df_train.shape[0]
    da = pd.concat([df_train, df_test])
    da = da.reset_index(drop=True)
    '''
    The feature engineering part; most of the FE was taken from other people's kernels.
    The last_days method adds the mean of full_sq over all houses sold in the last 30 days.
    This feature was motivated by my autoregression model for monthly prices. What does it
    capture? I tried the daily sum of full_sq, which clearly indicates supply and demand,
    but the local CV results for monthly price prediction actually prefer the mean. I think
    this feature somehow captures the supply and demand for luxury vs. economy properties.
    '''
    da = last_days(da)
    # These two features are only needed because I removed outlier feature values (> 4 SD) for all features, and these two are important to keep.
    da['build_year1'] = ((da['build_year'] == 1) &
                         (da.product_type == 'OwnerOccupier')).astype(int)
    da['build_year0'] = ((da['build_year'] == 0) &
                         (da.product_type == 'OwnerOccupier')).astype(int)

    # Fill some missing values based on location (Bhavesh Ghodasara's idea for
    # identifying the location)
    da = fill_years(da)
    da = fill_maxfloor(da)

    # Not necessary, I just fix it in order to calculate price per square meter for the sample weights
    da.loc[da['life_sq'] < 5, 'life_sq'] = np.NaN
    da.loc[da['full_sq'] < 5, 'full_sq'] = np.NaN

    # 0.7 comes from the mean life_sq/full_sq ratio (~0.65); 0.65 also works
    da['life_sq'] = np.where(da.life_sq.isnull(), da.full_sq * 0.7, da.life_sq)
    da['build_year'] = np.where(
        (da.build_year > 1690) & (da.build_year < 2020), da.build_year, np.NaN)
    da['max_floor'] = np.where(da.max_floor < da.floor, da.floor + 1,
                               da.max_floor)
    da['material'] = da['material'].astype(str)
    da.loc[da.state == 33, 'state'] = 3

    to_remove = []
    product_types = pd.factorize(da.product_type)[0]
    product_types_string = da.product_type.copy()

    da['month'] = da.timestamp.dt.year.astype(str)

    # The year_month feature of the test data is set to NaN in order to nullify
    # any effect of time; this is equivalent to saying that we do not know the
    # time for the test data, so any time effect must be learned from the macro
    # features.

    da['year_month'] = da.timestamp.dt.year
    da['year_month'] = (da['year_month'] * 100 + da.timestamp.dt.month)
    da.loc[da['year_month'] > 201506, 'year_month'] = np.NaN
    da['year_month'] = da['year_month'].astype(str)

    df_cat = None
    for c in da.columns:
        if da[c].dtype == 'object':
            oh = pd.get_dummies(da[c], prefix=c)

            if df_cat is None:
                df_cat = oh
            else:
                df_cat = pd.concat([df_cat, oh], axis=1)
            to_remove.append(c)
    da.drop(to_remove, inplace=True, axis=1)
    # Remove rare one hot encoded features
    to_remove = []
    if df_cat is not None:
        sums = df_cat.sum(axis=0)
        to_remove = sums[sums < 200].index.values
        df_cat = df_cat.loc[:, df_cat.columns.difference(to_remove)]
        da = pd.concat([da, df_cat], axis=1)
    if excluded is not None:
        for c in excluded:
            if c in da.columns:
                da.drop([c], inplace=True, axis=1)
    # These additional features are taken from
    # https://www.kaggle.com/wti200/deep-neural-network-for-starters-r
    da['na_count'] = da.isnull().sum(axis=1)
    da['rel_floor'] = da.floor / da.max_floor
    da['diff_floor'] = da.max_floor - da.floor
    da['rel_kitchen_sq'] = da.kitch_sq - da.full_sq
    da['rel_life_sq'] = da.life_sq / da.full_sq
    da['rel_kitch_life'] = da.kitch_sq / da.life_sq
    da['rel_sq_per_floor'] = da.full_sq / da.floor
    da['diff_life_sq'] = da.full_sq - da.life_sq
    da['building_age'] = da.timestamp.dt.year - da.build_year

    da['new_house_own'] = (
        (da['building_age'] <= 0) &
        (product_types_string == 'OwnerOccupier')).astype(int)
    da['old_house_own'] = (
        (da['building_age'] > 0) &
        (product_types_string == 'OwnerOccupier')).astype(int)
    # Macro features, finally!!!
    # The unemployment info for 2016 was missing, so the unemployment rates were taken from the OECD website.
    # The original unemployment data is useful, but the OECD data is better (LB score).
    # These macro features were selected by my autoregression time-series model for the
    # monthly mean prices, based on local CV results: "eurrub" and "brent" for Investment
    # properties, and "unemployment" for OwnerOccupier.
    macro_cols = ['timestamp', 'brent', 'eurrub', 'unemployment']
    macro = pd.read_csv('../input/macro.csv', parse_dates=['timestamp'])
    # Load the OECD unemployment
    # macro=macro_lib.fix(macro)
    macro = macro.loc[:, macro_cols]
    da = da.join(macro.set_index('timestamp'), on='timestamp')
    da[da == np.inf] = np.NaN
    if 'index' in da.columns:
        da.drop(['index'], inplace=True, axis=1)
    # Give tax-purpose properties a very low sample weights
    sample_weights = bad_weights(df_train, y_train, price_sq)
    train = da[:num_train].drop(['timestamp', 'id'], axis=1)
    test = da[num_train:].drop(['timestamp', 'id'], axis=1)
    # identify the binary features for excluding them from scaling
    bin_inds = []
    for c in train.columns:
        if train.loc[:, c].unique().shape[0] == 2 and train.loc[:, c].unique(
        ).sum() == 1:
            bin_inds.append(train.columns.get_loc(c))
    return train, test, y_train, da[num_train:].id, bin_inds, sample_weights
Example #36
def basic_preprocess(train_complete, 
                     test_complete, 
                     out_column, 
                     drop_columns=None,
                     forced_categorical = None, 
                     forced_numeric = None, 
                     columns_to_normalize = None,
                     use_labeler = None,
                     manual_processing = None,
                     seed=42,
                     perc=10):
  complete_features = pd.concat([train_complete, test_complete], sort=False).reset_index(drop=True)
  train = train_complete.copy()
  test = test_complete.copy()

  normalize_output = columns_to_normalize and out_column in columns_to_normalize
  if normalize_output:
    columns_to_normalize.remove(out_column)

  if use_labeler:
    if not columns_to_normalize:
      columns_to_normalize = []
    for column in use_labeler:
      if column in columns_to_normalize:
        columns_to_normalize.remove(column)

  convert_dict = {}
  if forced_categorical:
    for column in forced_categorical:
      convert_dict[column] = 'str'
  
  if forced_numeric:
    for column in forced_numeric:
      convert_dict[column] = 'float64'

  train = train.astype(convert_dict)
  test = test.astype(convert_dict) 

  if drop_columns:
    train.drop(drop_columns, axis=1, inplace=True)
    test.drop(drop_columns, axis=1, inplace=True)

  train_data = np.array(train[out_column])

  if normalize_output:
    normalize, denormalize = transform_distribution(train_data)
  else:
    normalize = lambda x: x
    denormalize = lambda x: x
    
  y = np.array(normalize(train_data))

  train_features = train.drop([out_column], axis=1)
  features = pd.concat([train_features, test], sort=False).reset_index(drop=True)
  
  impute_with_mode(features)

  numerics = list(features.select_dtypes(include=[np.number]).columns.values)
  if len(numerics) >= 2:
    imp = IterativeImputer(max_iter=10, sample_posterior=False, random_state=seed)
    imp.fit(features[numerics])
    features[numerics] = imp.transform(features[numerics])
  elif numerics:
    impute_with_median(features)

  if use_labeler:
    labeler = LabelEncoder()
    for column in use_labeler:
      features[column] = labeler.fit_transform(features[column])
  
  final_features = pd.get_dummies(features).reset_index(drop=True)
  if columns_to_normalize:
    normalize_columns(final_features, columns_to_normalize)

  if manual_processing:
    final_features = manual_processing(final_features, complete_features)
  
  X = final_features.iloc[:len(y), :]
  X_sub = final_features.iloc[len(X):, :]

  #print('selecting relevant features')
  #X, X_sub = select_features(X, y, X_sub, final_features.columns, perc=perc)

  return X, y, X_sub, denormalize
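IterativeImputer, used above, is still exported behind an experimental flag in scikit-learn, so it normally needs an explicit enable import. A minimal sketch:

from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
import numpy as np

imp = IterativeImputer(max_iter=10, sample_posterior=False, random_state=42)
print(imp.fit_transform(np.array([[1.0, 2.0], [3.0, np.nan], [5.0, 6.0]])))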
Example #37
# Box Cox Transformation of (highly) skewed features
# We use the scipy function boxcox1p which computes the Box-Cox transformation of  1+x .
# Note that setting  λ=0  is equivalent to log1p used above for the target variable.
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(
    skewness.shape[0]))

from scipy.special import boxcox1p

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    # all_data[feat] += 1
    all_data[feat] = boxcox1p(all_data[feat], lam)
# Getting dummy categorical features
all_data = pd.get_dummies(all_data)
print(all_data.shape)
# Getting the new train and test sets.
train = all_data[:ntrain]
test = all_data[ntrain:]

#Validation function
n_folds = 5


def rmsle_cv(model):
    kf = KFold(
        n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse = np.sqrt(-cross_val_score(
        model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
    print("rmse", rmse)
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
        
train_data = pd.read_csv("C:/01_Projects/09_CriticalFormulasandTools/PythonScripts/TitanicData/train.csv")
train_data.head()

test_data = pd.read_csv("C:/01_Projects/09_CriticalFormulasandTools/PythonScripts/TitanicData/test.csv")
test_data.head()

y = train_data["Survived"]

features = ["Pclass", "Sex", "Fare", "Age"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
X = my_imputer.fit_transform(X)
X_test = my_imputer.fit_transform(X_test)

model1 = GaussianNB()
model1.fit(X, y)
model2 = RandomForestClassifier(max_depth=15, n_estimators=100, bootstrap=False, max_features= 'sqrt', min_samples_leaf=4, min_samples_split=10)
Example #39
0
# Cat conversion


for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

y=train['y'] 
train.drop(['y'],inplace=True,axis=1)       
combine=pd.concat([train,test])
columns=['X1','X2','X3','X4','X5','X6','X8']
for column in columns:
    temp=pd.get_dummies(pd.Series(combine[column]))
    combine=pd.concat([combine,temp],axis=1)
    combine= combine.drop([column], axis=1)
        
# Define some useful functions

train=combine[:train.shape[0]]
test=combine[train.shape[0]:] 


def df_column_uniquify(df):
    df_columns = df.columns
    new_columns = []
    for item in df_columns:
        counter = 0
        newitem = item
Example #40
0
test_path = '~/Downloads/hacker_rank/Dataset/Test.csv'
raw_test_df = pd.read_csv(test_path)

index_column = 'Employee_ID'
train_index = raw_train_df.pop(index_column)
test_index = raw_test_df.pop(index_column)

# Merging both train and test
df = pd.concat([raw_train_df, raw_test_df], ignore_index=True)

# Categorical column
categorical_columns = [
    'Gender', 'Relationship_Status', 'Hometown', 'Unit',
    'Decision_skill_possess', 'Compensation_and_Benefits'
]
df = pd.get_dummies(df, columns=categorical_columns)

# print(df.isna().sum())
# Imputation for Time_of_service
# Time_of_service is related to Time_since_promotion
ptable = df.pivot_table(values='Time_of_service',
                        index='Time_since_promotion',
                        aggfunc=np.mean)


def get_element(x):
    index = int(x['Time_since_promotion'])
    return ptable.loc[index].values[0]


df['Time_of_service'].fillna(df[df['Time_of_service'].isnull()].apply(
    get_element, axis=1), inplace=True)
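The same group-mean imputation can be expressed more compactly with groupby/transform; a sketch assuming the same two columns as above:

group_mean = df.groupby('Time_since_promotion')['Time_of_service'].transform('mean')
df['Time_of_service'] = df['Time_of_service'].fillna(group_mean)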
Example #41
0
def display_stacked_cat_bar(df,
                            groupby,
                            on,
                            order=None,
                            unit=None,
                            palette=None,
                            horizontal=True,
                            figsize=(11, 11)):
    """
    Displays a stacked bar plot given two categorical variables
    :param df: DataFrame to display data from
    :param groupby: Column name by which bars would be grouped
    :param on: Column name of the different bar blocks
    :param order: Order in which to draw the bars by
    :param unit: Scale to which unit
    :param palette: Color palette to use for drawing
    :param horizontal: Horizontal or vertical barplot
    :param figsize: Figure size
    :return: matplotlib.Axis object
    """

    # Create a binary dataframe
    stacked_bar_df = pd.concat([df[groupby], pd.get_dummies(df[on])], axis=1)
    bins = list(stacked_bar_df.columns[1:])
    stacked_bar_df = stacked_bar_df.groupby(groupby)[bins].sum().reset_index()

    if order:
        if not isinstance(order, list):
            raise ValueError('"order" must be a list')
        if set(order) != set(bins):
            raise ValueError(
                '"order" iterable must contain all possible values: {}'.format(
                    str(bins)))

        stacked_bar_df = stacked_bar_df[[groupby] + order]
        bins = order

    # Scale if given unit
    if unit:
        # Calculate total
        stacked_bar_df['total'] = stacked_bar_df[bins].sum(axis=1)

        # Scale
        for bin_label in bins:
            stacked_bar_df[bin_label] /= stacked_bar_df['total']
            stacked_bar_df[bin_label] *= unit

        # Drop irrelevant 'total' column
        stacked_bar_df = stacked_bar_df.iloc[:, :-1]

    # Cumsum row wise
    for idx in range(1, len(bins)):
        stacked_bar_df[bins[idx]] = stacked_bar_df[bins[idx]] + stacked_bar_df[
            bins[idx - 1]]

    # Get relevant palette
    if palette:
        palette = palette[:len(bins)]
    else:
        palette = sns.color_palette()[:len(bins)]

    # Plot
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)

    if horizontal:
        for color, bin_label in reversed(list(zip(palette, bins))):
            sns.barplot(y=groupby,
                        x=bin_label,
                        data=stacked_bar_df,
                        color=color,
                        label=bin_label,
                        ax=ax)
    else:
        for color, bin_label in reversed(list(zip(palette, bins))):
            sns.barplot(x=groupby,
                        y=bin_label,
                        data=stacked_bar_df,
                        color=color,
                        label=bin_label,
                        ax=ax)

    ax.legend(bbox_to_anchor=(1.04, 1), loc='upper left')

    if unit:
        if horizontal:
            ax.set(xlim=(0, unit))
        else:
            ax.set(ylim=(0, unit))

    if horizontal:
        ax.set(xlabel='')
    else:
        ax.set(ylabel='')

    return ax
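A hypothetical call with made-up column names, showing how the function might be used (it assumes seaborn and matplotlib are imported, as the function body requires):

import pandas as pd

demo = pd.DataFrame({'region': ['north', 'north', 'south', 'south', 'south'],
                     'segment': ['A', 'B', 'A', 'A', 'B']})
# Stack 'segment' counts within each 'region', scaled to percentages
ax = display_stacked_cat_bar(demo, groupby='region', on='segment', unit=100)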
Example #42
0
def cleanpy(cols, changetype, encodecol, scaling, scalingcol, targetcol,
            dftest, cleandatapath, rawdatapath):
    import pandas as pd
    import numpy as np
    from sklearn import preprocessing
    import os
    cols = cols
    changetype = changetype
    encodecol = encodecol
    scaling = scaling
    scalingcol = scalingcol
    targetcol = [targetcol]
    dftest = ""

    df = pd.read_csv(rawdatapath)
    #feature scaling
    if (scalingcol[0] != "none"):
        if scaling == 'standarization':
            for feature in scalingcol:
                df[feature] = (df[feature] -
                               df[feature].mean()) / (df[feature].std())
        else:
            x = df[scalingcol].values  #returns a numpy array
            min_max_scaler = preprocessing.MinMaxScaler()
            x_scaled = min_max_scaler.fit_transform(x)
            df[scalingcol] = x_scaled

    #encoding

    le = preprocessing.LabelEncoder()
    if (encodecol[0] != "none"):
        if changetype == "labelencode":
            featurex = df[encodecol]
            featurex = featurex.apply(le.fit_transform)
            features = featurex.columns
            for feature in features:
                df.drop([feature], axis=1, inplace=True)
                df = pd.concat([df, featurex[feature]], axis=1)
        else:
            dummy = pd.get_dummies(df[encodecol])
            df = pd.concat([df, dummy], axis=1)
            df.drop(encodecol, axis=1, inplace=True)
    if df[df[targetcol].columns[0]].dtype == object:
        featurex = df[targetcol]
        featurex = featurex.apply(le.fit_transform)
        features = featurex.columns
        for feature in features:
            df.drop([feature], axis=1, inplace=True)
            df = pd.concat([df, featurex[feature]], axis=1)

    # drop columns
    if (cols[0] != "none"):
        df = df.drop(cols, axis=1)

    #  mandatory cleaning

    # removing rows having null values
    df.dropna(inplace=True)
    # try to convert all non-numeric values to numeric if possible
    df = df.infer_objects()
    # removing columns with object-type values, as they would cause problems during model creation
    removecol = df.select_dtypes(include=['object']).columns
    df.drop(labels=removecol, axis=1, inplace=True)

    #test data creation
    if dftest == "":
        msk = np.random.rand(len(df)) < 0.75
        dftrain = df[msk]
        dftest = df[~msk]
    else:
        dftrain = df
    #target variable separation
    ytrain = pd.DataFrame(dftrain[targetcol])
    ytest = pd.DataFrame(dftest[targetcol])
    dftrain.drop(targetcol, axis=1, inplace=True)
    dftest.drop(targetcol, axis=1, inplace=True)

    dftrain.to_csv(cleandatapath + "dftrain.csv", index=None)
    dftest.to_csv(cleandatapath + "dftest.csv", index=None)
    ytrain.to_csv(cleandatapath + "ytrain.csv", index=None)
    ytest.to_csv(cleandatapath + "ytest.csv", index=None)
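A hypothetical invocation of cleanpy (paths and column names are illustrative; per the function's own convention, passing "none" as the first list element skips that step):

cleanpy(cols=['none'],
        changetype='onehot',
        encodecol=['Sex', 'Embarked'],
        scaling='minmax',
        scalingcol=['Fare'],
        targetcol='Survived',
        dftest='',
        cleandatapath='clean/',
        rawdatapath='raw/titanic.csv')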
Example #43
0
def read_compas(filename=os.path.join(
    conf.datadir, "compas-analysis/compas-scores-two-years.csv"),
                smlfeatures=False,
                return_all=False,
                single_S=False):  #read compas dataset file (numeric ver)
    lines = [
        line for line in open(filename, "r").readlines()
        if line.find("?") == -1
    ]
    fo = open(filename, "w")
    for line in lines:
        fo.write(line)
    fo.close()
    #pd.set_option("display.max_rows", 100)
    #pd.set_option("display.max_colwidth", 100)
    #print dir(pd)
    data = pd.read_csv(filename, sep=',')

    int_values = [
        "age", "juv_fel_count", "decile_score", "juv_misd_count",
        "juv_other_count", "v_decile_score", "priors_count"
    ]  #,"is_recid"
    #string_values = ["sex","race","two_year_recid","c_charge_degree","c_charge_desc"]
    string_values = [
        "sex", "two_year_recid", "type_of_assessment", "v_type_of_assessment"
    ]  #,"r_charge_desc"]
    date_values = [
        "c_jail_in", "c_jail_out", "c_offense_date", "screening_date",
        "in_custody", "out_custody"
    ]

    my_attrs = []
    for int_val in int_values:
        my_attrs.append(data[int_val])
    for string_val in string_values:
        my_attrs.append(
            pd.get_dummies(data[string_val],
                           prefix=string_val,
                           drop_first=True))
    for date_val in date_values:
        temp = pd.to_datetime(data[date_val])
        t_min, t_max = min(temp), max(temp)
        my_attrs.append((temp - t_min) / (t_max - t_min))
    new_data = pd.concat(my_attrs, axis=1)
    new_data["African-American"] = (data["race"] == "African-American")
    new_data = new_data.dropna()
    if return_all:
        return new_data
    new_data.insert(0, "intercept", 1)

    corr_akey = []
    for akey in new_data.keys():
        corr_akey.append((np.corrcoef(new_data[akey],
                                      new_data["two_year_recid_1"])[0,
                                                                    1], akey))

    if single_S:
        S_keys = ["sex_Male"]
    else:
        S_keys = ["sex_Male", "African-American"]
    #race_Native American race_Asian race_Other race_Hispanic race_Caucasian
    S = np.transpose([list(new_data[i]) for i in S_keys])
    #S = np.array(S, dtype=np.int_)*2-1
    y = [v * 2.0 - 1.0 for v in new_data["two_year_recid_1"]]
    X_keys = set(new_data.keys()).difference([] + S_keys)
    X_keys_nonrace = set()
    for akey in X_keys:
        if akey.find("race") != 0:
            X_keys_nonrace.add(akey)
    X_keys = X_keys_nonrace
    print("X_keys=", len(X_keys), X_keys)
    #print list(race.keys())
    #X2_keys = set()
    X2_keys = set(["intercept"]).intersection(X_keys)
    print("X2 keys=", X2_keys)
    X2 = np.transpose([list(new_data[i]) for i in X2_keys])
    #print("X2=",str(X2))
    X2 = np.array(X2).reshape([len(new_data), len(X2_keys)])
    #print "X2=",X2.shape
    #print "X2=",X2
    X1_keys = X_keys.difference(X2_keys.union(set(["two_year_recid_1"])))
    if smlfeatures:
        X1_keys = X1_keys.difference(
            set([
                "out_custody", "decile_score", "in_custody", "c_jail_out",
                "c_jail_in", "screening_date", "v_decile_score"
            ]))
    X1 = np.transpose([list(new_data[i]) for i in X1_keys])
    print("X1 keys=", X1_keys)
    #sys.exit()
    #print "S=",S[:10]

    return np.array(S), np.array(X1), np.array(X2), np.array(y)
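A short usage sketch, assuming the COMPAS CSV is available at the default path configured in conf.datadir:

S, X1, X2, y = read_compas()
print(S.shape, X1.shape, X2.shape, len(y))  # sensitive attributes, features, intercept block, labels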
print("Sequential modeli çıkart...")
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000

# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250

# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data['Icerik'].values)
X = tokenizer.texts_to_sequences(data['Icerik'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
Y = pd.get_dummies(data['Kategori']).values
xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.05, random_state=42)
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.25))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
epochs = 5
batch_size = 128
history = model.fit(xTrain, yTrain, epochs=epochs, batch_size=batch_size, 
                    validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
acc = model.evaluate(xTest, yTest)
print(acc[1])
Example #45
0
df = pd.read_csv('data/titanic.csv')

# Describing the dataset
df.info()

# Viewing the dataset
df.head(5)

# Dropping the features that have no importance for the model: name, ticket code and cabin code:
df = df.drop(['Name','Ticket','Cabin', 'PassengerId'], axis = 1)

# Filling the null numeric values (NA) with the median.
df = df.fillna(df.median())

# Creating dummy variables for the categorical variables
df = pd.get_dummies(df ,prefix=['Sex', 'Embarked'], drop_first=True)

# Viewing the cleaned dataset:
df.head(10)

# Defining the dependent/independent variables.
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

# Creating the training and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Scaling the features
X_train = feature_scaling(X_train)
X_test = feature_scaling(X_test)
Example #46
0
#Get Valid Columns
filter_col = [
    col for col in df if col.startswith('Claim') or col.startswith('Past')
]
filter_col.remove('ClaimAmount')
X1 = df[[
    'MonthNumber', 'MonthYear', 'MinDate', 'TotalDays', 'DaysInPolicy',
    'Species', 'Breed', 'AgeAtEnroll', 'MinAgeInDays', 'MaxAgeInDays',
    'TotalDaysInPolicy', 'TotalMonthsInPolicy'
]]
X2 = df[filter_col]
XCombined = X1.join(X2)

#One Hot Encode categorical variables
X = pd.get_dummies(XCombined,
                   prefix_sep="_",
                   columns=['Breed', 'Species', 'AgeAtEnroll'])

y = df[['MinDate', 'ClaimAmount', 'PetId']]

#Create Train/Test Splits
#Test on previous year for accuracy
date = pd.Timestamp(2018, 7, 1)
X_train = X.loc[X['MinDate'] < date]
y_train = y.loc[y['MinDate'] < date]

X_test = X.loc[X['MinDate'] == date]
y_test = y.loc[y['MinDate'] == date]

X_train.drop('MinDate', axis='columns', inplace=True)
X_test.drop('MinDate', axis='columns', inplace=True)
Example #47
0
    f.write('NA,NA,140000\n')

# load the raw dataset from the created csv file

# if pandas is not installed, just uncomment the following line:
# !pip install pandas
import pandas as pd

data = pd.read_csv(data_file)
print(data)

############### 2.2.2. Handling Missing Data ###############
# NaN values represent missing data. To handle them, we can either fill in a value (imputation) or delete the entry.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
inputs = inputs.fillna(inputs.mean())
print(inputs)

# For the Alley column, this splits it into Alley_Pave and Alley_nan indicator columns of 1s and 0s
inputs = pd.get_dummies(inputs, dummy_na=True)

############### 2.2.3. Conversion to the ndarray Format ###############
# Since inputs and outputs are both numerical, they can be converted to the ndarray format
from mxnet import np

X, y = np.array(inputs.values), np.array(outputs.values)
X
y

############### 2.2.4. Summary ###############
# like many other extension packages in the vast ecosystem of Python, pandas can work together with ndarray
# imputation and deletion can be used to handle missing data
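A small self-contained illustration (synthetic data) of what get_dummies(..., dummy_na=True) produces for a column like Alley:

import pandas as pd

demo = pd.DataFrame({'NumRooms': [None, 2.0, 4.0, None],
                     'Alley': ['Pave', None, None, None]})
demo['NumRooms'] = demo['NumRooms'].fillna(demo['NumRooms'].mean())
# Adds Alley_Pave and Alley_nan indicator columns
print(pd.get_dummies(demo, dummy_na=True))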
Example #48
0
    def data_preparation(self, df5):

        ## 5.1. Data normalization

        ## 5.2. Data rescaling

        # Before choosing which rescale method will be used, we must know which variables have outliers.
        #sns.boxplot(df5['competition_distance'])

        # competition distance
        df5['competition_distance'] = self.competition_distance_scaler.fit_transform(
            df5[['competition_distance']].values)

        # competition time month
        df5['competition_time_month'] = self.time_month_scaler.fit_transform(
            df5[['competition_time_month']].values)

        # year
        df5['year'] = self.year_scaler.fit_transform(df5[['year']].values)

        # promo time week
        df5['promo_time_week'] = self.promo_time_week_scaler.fit_transform(
            df5[['promo_time_week']].values)

        #sns.distplot(df5['competition_distance'])

        ## 5.3. Data transformation

        ### 5.3.1. Encoding

        #df5.select_dtypes('object')

        # state holiday - One hot encoding
        df5 = pd.get_dummies(df5,
                             prefix=['state_holiday'],
                             columns=['state_holiday'])

        # store type - Label Encoder
        df5['store_type'] = self.encoding_store_type.fit_transform(
            df5['store_type'])

        # assortment - Ordinal Encoder
        assortment_dict = {'basic': 1, 'extra': 2, 'extended': 3}
        df5['assortment'] = df5['assortment'].map(assortment_dict)

        ### 5.3.2. Nature Transformation

        # day of week
        df5['day_of_week_sin'] = df5['day_of_week'].apply(
            lambda x: np.sin(x * (2 * np.pi / 7)))
        df5['day_of_week_cos'] = df5['day_of_week'].apply(
            lambda x: np.cos(x * (2 * np.pi / 7)))

        # day
        df5['day_sin'] = df5['day'].apply(lambda x: np.sin(x *
                                                           (2 * np.pi / 30)))
        df5['day_cos'] = df5['day'].apply(lambda x: np.cos(x *
                                                           (2 * np.pi / 30)))

        # month
        df5['month_sin'] = df5['month'].apply(
            lambda x: np.sin(x * (2 * np.pi / 12)))
        df5['month_cos'] = df5['month'].apply(
            lambda x: np.cos(x * (2 * np.pi / 12)))

        # week of year
        df5['week_of_year_sin'] = df5['week_of_year'].apply(
            lambda x: np.sin(x * (2 * np.pi / 52)))
        df5['week_of_year_cos'] = df5['week_of_year'].apply(
            lambda x: np.cos(x * (2 * np.pi / 52)))

        cols_selected = [
            'store', 'promo', 'store_type', 'assortment',
            'competition_distance', 'competition_open_since_month',
            'competition_open_since_year', 'promo2', 'promo2_since_week',
            'promo2_since_year', 'competition_time_month', 'promo_time_week',
            'day_of_week_sin', 'day_of_week_cos', 'day_sin', 'day_cos',
            'month_cos', 'month_sin', 'week_of_year_cos', 'week_of_year_sin'
        ]

        return df5[cols_selected]
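A standalone check of the sine/cosine "nature transformation" used above (illustrative): encoding day_of_week on the unit circle makes day 6 and day 0 as close together as any other pair of adjacent days.

import numpy as np

days = np.arange(7)
sin_d, cos_d = np.sin(days * 2 * np.pi / 7), np.cos(days * 2 * np.pi / 7)
dist_6_0 = np.hypot(sin_d[6] - sin_d[0], cos_d[6] - cos_d[0])
dist_0_1 = np.hypot(sin_d[0] - sin_d[1], cos_d[0] - cos_d[1])
assert np.isclose(dist_6_0, dist_0_1)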
Example #49
0
# AGM
#data = data.drop (columns=[
	#'flowStartMilliseconds',
	#'sourceIPAddress',
	#'mode(destinationIPAddress)',
	#'mode(_tcpFlags)',
	#'Label',
	#'Attack' ])
	
#nominalFeatures = ['mode(sourceTransportPort)', 'mode(destinationTransportPort)', 'mode(protocolIdentifier)']

for nominal in nominalFeatures:
	freqValues = list(data[nominal].value_counts().iloc[:10].keys())
	data.loc[~data[nominal].isin(freqValues),nominal] = np.nan
data = pd.get_dummies (data, columns = nominalFeatures, drop_first = True, dtype=np.float64)

columns = list(data.columns)
allNominal = []
nominalColumns = {}
for feat in nominalFeatures:
	nominalColumns[feat] = [ i for i in range(len(columns)) if columns[i].startswith(feat+'_') ]
	allNominal +=  nominalColumns[feat]
	
vec = [False] * len(columns)
notAdded = [ ind for ind in range(len(columns)) if ind not in allNominal ]
notAddedNominal = nominalFeatures[:]

# downsample to 5%
_, data, _, labels = train_test_split (data, labels, test_size=0.05, random_state = 1, stratify=attacks)
data = minmax_scale (data)
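A minimal synthetic illustration of the frequency-capping trick above: values outside the most frequent set are replaced with NaN before one-hot encoding, which keeps the dummy matrix small.

import numpy as np
import pandas as pd

s = pd.Series(list('aaabbc'))
top = list(s.value_counts().iloc[:2].keys())   # keep only the 2 most frequent values
s[~s.isin(top)] = np.nan
print(pd.get_dummies(s, drop_first=True, dtype=np.float64))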
Example #50
0
### Pre-processing
# Creation of weekend variable from dayofweek variable
print(df.groupby(['dayofweek'])['loan_status'].value_counts(normalize=True),"\n")
df['weekend'] = df['dayofweek'].apply(lambda x: 1 if (x>0 and x<4)  else 0)
# Result
print(df.weekend.value_counts(), "\n")
print(df.groupby(['weekend'])['loan_status'].value_counts(normalize=True))

# Convert categorical features to numerical values
df['Gender'].replace(to_replace=['male','female'], value=[0,1],inplace=True)



### Feature selection
final_df = df[['loan_status','Principal','terms','age','Gender','weekend']]
final_df = pd.concat([final_df,pd.get_dummies(df['education'])], axis=1)
final_df.drop(['Master or Above'], axis = 1,inplace=True)

# Reduced dataset wih only the most significant variables
#final_df = df[['loan_status','age','Gender','weekend']]


# Finalize dataset
X = final_df.drop(columns = ['loan_status'])
X = preprocessing.StandardScaler().fit(X).transform(X)
y = final_df['loan_status'].values

# Split between train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
Example #51
0
# Create a boxplot of life expectancy per region
df.boxplot('life', 'Region', rot=60)

# Show the plot
plt.show()

#Creating dummy variables
# As Andy discussed in the video, scikit-learn does not accept non-numerical features.
# You saw in the previous exercise that the 'Region' feature contains very useful information
# that can predict life expectancy. For example, Sub-Saharan Africa has a lower life expectancy
# compared to Europe and Central Asia. Therefore, if you are trying to predict life expectancy,
# it would be preferable to retain the 'Region' feature. To do this, you need to binarize it by
# creating dummy variables, which is what you will do in this exercise.

# Create dummy variables: df_region
df_region = pd.get_dummies(df)

# Print the columns of df_region
print(df_region.columns)

# Create dummy variables with drop_first=True: df_region
df_region = pd.get_dummies(df, drop_first=True)

# Print the new columns of df_region
print(df_region.columns)

#Regression with categorical features
# Having created the dummy variables from the 'Region' feature, you can build
# regression models as you did before. Here, you'll use ridge regression to perform 5-fold cross-validation.
# The feature array X and target variable array y have been pre-loaded.
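A minimal sketch of the 5-fold ridge cross-validation described above, assuming X and y are the pre-loaded feature array (from df_region) and target:

from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

ridge = Ridge(alpha=0.5)              # alpha chosen only for illustration
ridge_cv = cross_val_score(ridge, X, y, cv=5)
print(ridge_cv)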
Example #52
0
def one_hot_encoder(df, nan_as_category=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
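A hypothetical usage of this helper, on any dataframe df that has object-dtype columns:

df, new_cols = one_hot_encoder(df, nan_as_category=True)
print('added', len(new_cols), 'dummy columns')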
Example #53
0
intercolumnarDistance = dataset.iloc[:, 2].values
upperMargin = dataset.iloc[:, 3].values
lowerMargin = dataset.iloc[:, 4].values
exploitation = dataset.iloc[:, 5].values
rowNumber = dataset.iloc[:, 6].values
modularRatio = dataset.iloc[:, 7].values
interlinearSpacing = dataset.iloc[:, 8].values
weight = dataset.iloc[:, 9].values
peakNumber = dataset.iloc[:, 10].values
yvalue_class = dataset.iloc[:, 11].values #Y value, Vectorize

encoder = LabelEncoder()


y = encoder.fit_transform(yvalue_class)
Y = pd.get_dummies(y).values
Y = np.array(Y)

X = []
for a,b,c,d,e,f,g,h,i in zip(intercolumnarDistance, upperMargin, lowerMargin, exploitation, rowNumber, modularRatio, interlinearSpacing, weight, peakNumber):
    X.append([a,b,c,d,e,f,g,h,i])
X = np.array(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y)

model = Sequential()

model.add(Dense(8, input_shape=(9, ), activation='softmax'))
model.add(Dense(10, activation='tanh'))
model.add(Dense(12, activation='relu'))
Example #54
0
from sklearn import model_selection
import joblib

#returns current working directory
os.getcwd()
#changes working directory
os.chdir("E:/")

titanic_train = pd.read_csv("train.csv")

#EDA
titanic_train.shape
titanic_train.info()

#data preparation
titanic_train1 = pd.get_dummies(titanic_train, columns=['Pclass', 'Sex', 'Embarked'])
titanic_train1.shape
titanic_train1.info()
titanic_train1.head(6)

#feature engineering 
X_train = titanic_train1.drop(['PassengerId','Age','Cabin','Ticket', 'Name','Survived'], axis=1)
y_train = titanic_train['Survived']

#build the decision tree model
dt = tree.DecisionTreeClassifier()
#use cross validation to estimate the performance of the model.
#models built during cross validation are not used as the final model
cv_scores = model_selection.cross_val_score(dt, X_train, y_train, cv=10, verbose=1)
cv_scores.mean()
Example #55
0
#alter['ALTBE'].fillna('0', inplace=True)
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTBE']])
#alter.loc[np.where(l == 'null')[0],'ALTBE'] = '0'
#alter['ALTBE'] = np.array(
#        [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTBE']])

#alter['ALTAF'].fillna('0', inplace=True)
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTAF']])
#alter.loc[np.where(l == 'null')[0],'ALTAF'] = '0'
#alter['ALTAF'] = np.array(
#        [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTAF']])

#ALTDATE
alter['alt_year'], alter['alt_month'] = splitDate(alter['ALTDATE'])
alter_year_dummies = pd.get_dummies(alter['alt_year'])
alter_month_dummies = pd.get_dummies(alter['alt_month'])
addColumnsPrefix(alter_year_dummies, 'alter_year')
addColumnsPrefix(alter_month_dummies, 'alter_month')

alter_date_dummies = alter_year_dummies.join(alter_month_dummies)
alter_date_dummies[['EID']] = alter[['EID']]
alter_date_dummies = alter_date_dummies.groupby('EID').sum()
alter_date_dummies.reset_index(inplace=True)

#ALTERNO
alterno_dummies = pd.get_dummies(X_alter['ALTERNO'])
alterno_dummies[['EID']] = X_alter[['EID']]
alterno_dummies = alterno_dummies.groupby('EID').sum()
addColumnsPrefix(alterno_dummies, 'alterno')
alterno_dummies.reset_index(inplace=True)
Example #56
0
 def transform(self,df):
     df = pd.get_dummies(df, columns = self.column_array)
     if self.column_array_drop_first != [] :
         df = pd.get_dummies(df, columns = self.column_array_drop_first, drop_first = True)
     return df
Example #57
0
    dep_air_train= (df_train_co[i,0],df_train_co[i,1])
    arr_air_train= (df_train_co[i,2],df_train_co[i,3])
    dist_train.append(vincenty(dep_air_train,arr_air_train).km)
    
df_dist_train= pd.DataFrame({'Distance': dist_train})
# find vincenty distance to create new feature distance on test
dist_test=[]
for i in range(0,2229):
    dep_air_test= (df_test_co[i,0],df_test_co[i,1])
    arr_air_test= (df_test_co[i,2],df_test_co[i,3])
    dist_test.append(vincenty(dep_air_test,arr_air_test).km)

df_dist_test= pd.DataFrame({'Distance': dist_test})

df_train.drop(df_train.columns[[0,2,3,4,6,7,8,11]], axis=1, inplace=True)
df_train= pd.concat([df_train,pd.get_dummies(df_train['Departure'],prefix='Departure'),pd.get_dummies(df_train['Arrival'],prefix='Arrival')],axis=1)
df_train.drop(['Departure'],axis=1,inplace=True)
df_train.drop(['Arrival'],axis=1,inplace=True)

df_train_w_dist= pd.concat([df_train,df_dist_train], axis=1)

df_test.drop(df_test.columns[[0,2,3,4,6,7,8]], axis=1, inplace=True)
df_test= pd.concat([df_test,pd.get_dummies(df_test['Departure'],prefix='Departure'),pd.get_dummies(df_test['Arrival'],prefix='Arrival')],axis=1)
df_test.drop(['Departure'],axis=1,inplace=True)
df_test.drop(['Arrival'],axis=1,inplace=True)

df_test_w_dist= pd.concat([df_test,df_dist_test],axis=1)
#special days
tygiving='10-25' #thanksgiving
mday='05-11' #mother's day
iday='07-04' #independence day
Example #58
0
    os.mkdir(dirname)
    dirname = 'HandwritingVerification'
    os.mkdir('plots/%s' %dirname)

if(not os.path.exists('plots/HandwritingVerification')):
    dirname = 'HandwritingVerification'
    os.mkdir('plots/%s' %dirname)
    dirname = 'Boosting'
    os.mkdir('plots/HandwritingVerification/%s' %dirname)

if(not os.path.exists('plots/HandwritingVerification/Boosting')):
    dirname = 'Boosting'
    os.mkdir('plots/HandwritingVerification/%s' %dirname)


df = pd.get_dummies(df)
# print(df.head(5))

Z = df.loc[:, df.columns != 'CLASS_DISTINCT']
X = Z.loc[:, Z.columns != 'CLASS_SAME']
y = df['CLASS_DISTINCT']


X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.30, random_state=30)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
Example #59
0
def cleaning_data():
    # Importing the datasets
    portfolio = pd.read_json("portfolio.json", lines=True)
    profile = pd.read_json("profile.json", lines=True)
    transcript = pd.read_json("transcript.json", lines=True)

    # Data Cleaning of portfolio dataset
    ohe = {
        'email': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        'mobile': [1, 1, 1, 1, 0, 1, 1, 1, 1, 1],
        'social': [1, 1, 0, 0, 0, 1, 1, 1, 1, 0],
        'web': [0, 1, 1, 1, 1, 1, 1, 0, 1, 1]
    }

    ohx = pd.DataFrame(ohe, columns=['email', 'mobile', 'social', 'web'])

    cleaned_portfolio = portfolio
    cleaned_portfolio = pd.concat([portfolio, ohx], axis=1)
    cleaned_portfolio = cleaned_portfolio.drop(['channels'], axis=1)

    # converting duration from days to hours for better comparison
    cleaned_portfolio['duration'] = cleaned_portfolio['duration'] * 24

    # one hot encoding the offer_type column
    ohe = pd.get_dummies(cleaned_portfolio['offer_type'])
    cleaned_portfolio = pd.concat([cleaned_portfolio, ohe], axis=1)
    cleaned_portfolio = cleaned_portfolio.drop(['offer_type'], axis=1)

    # renaming the id column to offer_id
    cleaned_portfolio = cleaned_portfolio.rename(columns={'id': 'offer_id'})

    # Data Cleaning of profile dataset

    # To check the number of NULL values in each column
    # profile.isnull().sum()
    '''
    gender              2175
    age                    0
    id                     0
    became_member_on       0
    income              2175
    '''
    # Also, checking the age column for all the points where gender and income
    # are NULL, we find the corresponding age value is 118, which is quite
    # unusual. So, to cleanse the data, we drop all such points.

    # Dropping NULL values
    cleaned_profile = profile
    cleaned_profile = cleaned_profile.dropna()

    # Renaming the id column to customer_id
    cleaned_profile = cleaned_profile.rename(columns={'id': 'person_id'})

    # OneHotEncoding the gender column
    ohe = pd.get_dummies(cleaned_profile['gender'])
    cleaned_profile = pd.concat([cleaned_profile, ohe], axis=1)

    # Convert became_member_on to a date-time stamp, because a date stored as
    # an integer is not meaningful to the model.
    cleaned_profile['became_member_on'] = pd.to_datetime(
        cleaned_profile['became_member_on'], format='%Y%m%d').dt.date

    # We add a today's-date column to the dataframe as a reference to calculate the number of days the customer has been a member of Starbucks
    cleaned_profile['today_date'] = pd.to_datetime('20200828', format='%Y%m%d')
    cleaned_profile['today_date'] = pd.to_datetime(
        cleaned_profile['today_date'], format='%Y%m%d').dt.date
    cleaned_profile['days_of_membership'] = cleaned_profile['today_date'].sub(
        cleaned_profile['became_member_on'], axis=0)

    # Divide the timedelta by one day to convert the membership duration into a number of days
    cleaned_profile['days_of_membership'] = cleaned_profile[
        'days_of_membership'] / np.timedelta64(1, 'D')
    cleaned_profile['became_member_on'] = pd.to_datetime(
        cleaned_profile['became_member_on'], format='%Y-%m-%d').dt.year

    # Then we drop the reference column because it is not useful for further analysis
    cleaned_profile = cleaned_profile.drop(['today_date'], axis=1)
    cleaned_profile['age_by_decade'] = pd.cut(cleaned_profile['age'],
                                              bins=range(10, 120, 10),
                                              right=False,
                                              labels=[
                                                  '10s', '20s', '30s', '40s',
                                                  '50s', '60s', '70s', '80s',
                                                  '90s', '100s'
                                              ])
    cleaned_profile['income_range'] = pd.cut(cleaned_profile['income'],
                                             bins=range(0, 120001, 10000),
                                             right=False,
                                             labels=[
                                                 '10k', '20k', '30k', '40k',
                                                 '50k', '60k', '70k', '80k',
                                                 '90k', '100k', '110k', '120k'
                                             ])

    # Data Cleaning of transcript.json
    cleaned_transcript = transcript

    # OneHotEncoding the event column
    ohy = pd.get_dummies(cleaned_transcript['event'])
    cleaned_transcript = pd.concat([cleaned_transcript, ohy], axis=1)
    cleaned_transcript = cleaned_transcript.drop(['event'], axis=1)

    # Remove all records for the people with NULL values which we previously dropped.
    profile118 = profile[profile['age'] == 118]
    id118 = profile118['id']

    cleaned_transcript = cleaned_transcript[~cleaned_transcript['person'].
                                            isin(id118)]

    cleaned_transcript['record'] = cleaned_transcript.value.apply(
        lambda x: list(x.keys())[0])
    cleaned_transcript['record_value'] = cleaned_transcript.value.apply(
        lambda x: list(x.values())[0])
    cleaned_transcript.drop(['value'], axis=1, inplace=True)

    transactions = cleaned_transcript[cleaned_transcript.transaction == 1]
    offers = cleaned_transcript[cleaned_transcript.transaction != 1]

    # cleaning transactions
    transactions = transactions.drop(
        ['offer completed', 'offer viewed', 'offer received'], axis=1)
    transactions = transactions.drop(['transaction', 'record'], axis=1)
    transactions = transactions.rename(columns={'record_value': 'amount'})
    transactions['amount_range'] = pd.cut(
        transactions['amount'],
        bins=range(0, 1150, 50),
        right=False,
        labels=[
            '50', '100', '150', '200', '250', '300', '350', '400', '450',
            '500', '550', '600', '650', '700', '750', '800', '850', '900',
            '950', '1000', '1050', '1100'
        ])
    # cleaning offers
    offers = offers.drop(['transaction', 'record'], axis=1)
    offers = offers.rename(columns={'record_value': 'offer_id'})

    return cleaned_portfolio, cleaned_profile, offers, transactions
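A usage sketch (it assumes portfolio.json, profile.json and transcript.json are present in the working directory, as the function expects):

cleaned_portfolio, cleaned_profile, offers, transactions = cleaning_data()
print(cleaned_portfolio.shape, cleaned_profile.shape, offers.shape, transactions.shape)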
Example #60
-1
    def loaddataset(self,path,module):                
       df=pd.read_csv(path)
       subdf = df[['PassengerId','Pclass','Sex','Age','Embarked','Fare','SibSp','Parch']]
       SibSp=subdf['SibSp']
       Parch=subdf['Parch']
#      fill missing Age (and Fare below) with the column mean
       Age=subdf['Age'].fillna(value=subdf.Age.mean())
             
       Fare=subdf['Fare'].fillna(value=subdf.Fare.mean())
       
       dummies_Sex=pd.get_dummies(subdf['Sex'],prefix='Sex')
       
       dummies_Embarked = pd.get_dummies(subdf['Embarked'], prefix= 'Embarked')     
       
       dummies_Pclass = pd.get_dummies(subdf['Pclass'], prefix= 'Pclass')
       
       PassengerId=subdf['PassengerId']
       
#      Age&Fare to Scaler
       scaler=MinMaxScaler()
       age_scaled=scaler.fit_transform(Age.values)
       fare_scaled=scaler.fit_transform(Fare.values)
       
       Age_Scaled=pd.DataFrame(age_scaled,columns=['Age_Scaled'])
       Fare_Scaled=pd.DataFrame(fare_scaled,columns=['Fare_Scaled'])
       
       if module=='train':
          self.trainlabel=df.Survived
          self.trainset=pd.concat([dummies_Pclass,dummies_Sex,dummies_Embarked,Age_Scaled,Fare_Scaled,SibSp,Parch],axis=1)
       elif module=='test':
          self.testset=pd.concat([PassengerId,dummies_Pclass,dummies_Sex,dummies_Embarked,Age_Scaled,Fare_Scaled,SibSp,Parch],axis=1)