def predictions(dataframe):
    dataframe['log_ENTRIESn_hourly'] = np.log1p(dataframe.ENTRIESn_hourly)  # log transformation
    features = dataframe[[]]  # option 2: features = dataframe[['meantempi', 'rain']]
    dummy_unit = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    dummy_hour = pd.get_dummies(dataframe['hour'], prefix='hour')
    dummy_day_week = pd.get_dummies(dataframe['day_week'], prefix='day_week')
    features = features.join(dummy_hour).join(dummy_day_week).join(dummy_unit)  # optionally .join(dummy_rain)
    # removing one dummy from each group to avoid the dummy variable trap
    features.drop(['unit_R003'], axis=1, inplace=True)
    features.drop(['hour_0'], axis=1, inplace=True)
    features.drop(['day_week_0'], axis=1, inplace=True)
    values = dataframe['ENTRIESn_hourly']
    values_log = dataframe['log_ENTRIESn_hourly']
    # Perform linear regression on the log-transformed target
    intercept, params = linear_regression_SGD(features, values_log)
    log_predictions = intercept + np.dot(features, params)
    log_predictions[log_predictions < 0] = 1
    predictions = np.expm1(log_predictions)  # inverse log transformation back to ENTRIESn_hourly
    residuals = values - predictions
    return predictions
def transform(self, X_df):
    X_encoded = X_df
    # uncomment the line below in the submission
    path = os.path.dirname(__file__)
    special_days = pd.read_csv(os.path.join(path, "data_specialdays.csv"), sep=';')
    X_encoded = X_encoded.merge(special_days, how='left',
                                left_on=['DateOfDeparture'], right_on=['DateOfDeparture'],
                                sort=False)
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['Departure'], prefix='d'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['Arrival'], prefix='a'))
    X_encoded['DateOfDeparture'] = pd.to_datetime(X_encoded['DateOfDeparture'])
    X_encoded['year'] = X_encoded['DateOfDeparture'].dt.year
    X_encoded['weekday'] = X_encoded['DateOfDeparture'].dt.weekday
    X_encoded['week'] = X_encoded['DateOfDeparture'].dt.week
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['year'], prefix='y'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['weekday'], prefix='wd'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['week'], prefix='w'))
    X_encoded = X_encoded.drop(['Departure', 'Arrival', 'weekday', 'week', 'year',
                                'std_wtd', 'WeeksToDeparture', 'DateOfDeparture'], axis=1)
    X_array = X_encoded.values
    return X_array
def eval_wrapper(ypred, ytrue):
    # ypred holds class probabilities, e.g. [0.1, 0.2, 0.7]; ytrue holds integer labels.
    # Append the labels 0, 1, 2 so that get_dummies always produces all three columns,
    # then drop those padding rows again below.
    ytrue = np.concatenate((ytrue, np.array([0, 1, 2])))
    print(ypred.shape, ytrue.shape)
    if len(ytrue.shape) != 2:
        dmmat = pd.get_dummies(np.array(ytrue))
        ytrue = dmmat.values  # [n+3, 3]
    if len(ypred.shape) != 2:
        dmmat = pd.get_dummies(np.array(ypred))
        ypred = dmmat.values  # [n, 3]
    ytrue = ytrue[:-3, :]  # remove the three padding rows
    # alternative accuracy / squared-error metric (unused):
    # yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)
    # err = np.sum((y - yhat) * (y - yhat)) / float(y.shape[0]); return err
    # negative log-likelihood: [n, 3] -> [n] -> scalar
    return -np.mean(np.sum(ytrue * np.log(ypred + 0.00001), axis=1))
def getdummy(rawData,categories,stage): #make a copy data = rawData.copy() if stage == "training": for category in categories: columns = list(data.columns.values) #print data[category] columnValues = set(data[category]) #print columnValues dummy = pd.get_dummies(data[category],prefix=category) #print dummy.head(10) if dummy.shape[1] > 1: columns.remove(category) data = data[columns].join(dummy.ix[:,1:]) elif dummy.shape[1] == 1: columns.remove(category) data = data[columns].join(dummy) if stage == "testing": #print categories columns = list(data.columns.values) for category in categories: columnValues = set(data[category]) #print columnValues dummy = pd.get_dummies(data[category],prefix=category) #print dummy.head(10) dummyColumns = list(dummy.columns.values) for dummyColumn in dummyColumns: if dummyColumn in columns: data[dummyColumn] = dummy[dummyColumn] columns.remove(category) data = data[columns] #print dummy.head(10) return(data)
def getdummy(rawData,categories,stage,shuffle=False): #make a copy data = rawData.copy() if stage == "training": for category in categories: columns = list(data.columns.values) columnValues = set(data[category]) dummy = pd.get_dummies(data[category],prefix=category) if dummy.shape[1] > 1: columns.remove(category) data = data[columns].join(dummy.ix[:,1:]) elif dummy.shape[1] == 1: columns.remove(category) data = data[columns].join(dummy) #shuffle data if shuffle == True: data = shuffledata(data) if stage == "testing": columns = list(data.columns.values) for category in categories: columnValues = set(data[category]) dummy = pd.get_dummies(data[category],prefix=category) dummyColumns = list(dummy.columns.values) for dummyColumn in dummyColumns: if dummyColumn in columns: data[dummyColumn] = dummy[dummyColumn] columns.remove(category) data = data[columns] return(data)
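# A note on the training/testing stages in the two getdummy variants above: rebuilding
# the dummies separately for each stage can silently drop categories that appear in only
# one of the two sets. A minimal sketch (not part of the original code, assuming plain
# pandas DataFrames) that keeps the columns aligned by reindexing the test dummies onto
# the training columns:
import pandas as pd

def encode_train_test(train_df, test_df, categories):
    # hypothetical helper: one-hot encode so train and test share identical columns
    train_enc = pd.get_dummies(train_df, columns=categories)
    test_enc = pd.get_dummies(test_df, columns=categories)
    # categories unseen in the test set become all-zero columns instead of disappearing
    test_enc = test_enc.reindex(columns=train_enc.columns, fill_value=0)
    return train_enc, test_enc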
def test_basic(self, sparse, dtype): s_list = list('abc') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype=self.effective_dtype(dtype)) result = get_dummies(s_list, sparse=sparse, dtype=dtype) if sparse: tm.assert_sp_frame_equal(result, expected.to_sparse(kind='integer', fill_value=0)) else: assert_frame_equal(result, expected) result = get_dummies(s_series, sparse=sparse, dtype=dtype) if sparse: expected = expected.to_sparse(kind='integer', fill_value=0) assert_frame_equal(result, expected) expected.index = list('ABC') result = get_dummies(s_series_index, sparse=sparse, dtype=dtype) if sparse: expected.to_sparse(kind='integer', fill_value=0) assert_frame_equal(result, expected)
def create_dummy_variables(df):
    # all the quantitative variables are collected with the describe function
    quant_variable = df.describe().columns.values
    column = df.columns.values
    # simplest alternative: return pd.get_dummies(df) directly
    for i in column:
        if i not in quant_variable:
            # we are dealing with a qualitative variable
            df[i].fillna("no_present", inplace=True)
            dummy_variable = pd.get_dummies(df[i], prefix=i)
            print("DUMMY:")
            print(dummy_variable.head(5))
            print("COLUMN: ", i)
            print(dummy_variable.info())
            for dummy in dummy_variable:
                df[dummy] = dummy_variable[dummy]
            df.drop(i, axis=1, inplace=True)
    df.to_csv("output/train_fixed2.csv")
    return df
def test_include_na(self, sparse, dtype): if sparse: pytest.xfail(reason='nan in index is problematic (GH 16894)') s = ['a', 'b', np.nan] res = get_dummies(s, sparse=sparse, dtype=dtype) exp = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0]}, dtype=self.effective_dtype(dtype)) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype) exp_na = DataFrame({nan: [0, 0, 1], 'a': [1, 0, 0], 'b': [0, 1, 0]}, dtype=self.effective_dtype(dtype)) exp_na = exp_na.reindex(['a', 'b', nan], axis=1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([nan], dummy_na=True, sparse=sparse, dtype=dtype) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], dtype=self.effective_dtype(dtype)) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def test_dataframe_dummies_with_na(self, df, sparse, dtype): df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index(axis=1) if sparse: arr = SparseArray typ = SparseDtype(dtype, 0) else: arr = np.array typ = dtype expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': arr([1, 0, 1, 0], dtype=typ), 'A_b': arr([0, 1, 0, 0], dtype=typ), 'A_nan': arr([0, 0, 0, 1], dtype=typ), 'B_b': arr([1, 1, 0, 0], dtype=typ), 'B_c': arr([0, 0, 1, 0], dtype=typ), 'B_nan': arr([0, 0, 0, 1], dtype=typ) }).sort_index(axis=1) assert_frame_equal(result, expected) result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected)
def slide_17():
    df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
    print(pd.get_dummies(df['key']))
    dummies = pd.get_dummies(df['key'], prefix='key')
    print(dummies)
    df_with_dummy = df[['data1']].join(dummies)
    print(df_with_dummy)

    mnames = ['movie_id', 'title', 'genres']
    movies = pd.read_table(MOVIELENSPATH, sep='::', header=None,
                           engine='python', names=mnames)
    print(movies[:10])
    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))
    print(genres)
    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
    for i, gen in enumerate(movies.genres):
        dummies.loc[i, gen.split('|')] = 1
    movies_windic = movies.join(dummies.add_prefix('Genre_'))
    print(movies_windic.iloc[0])

    values = np.random.rand(10)
    print(values)
    bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
    print(pd.get_dummies(pd.cut(values, bins)))
def __init__(self, x_train, y_train, x_test=None, y_test=None, n_hidden_1=256, n_hidden_2=256, batch_size=100, learning_rate=0.01, training_epochs=30, display_step=-1): self.learning_rate = learning_rate self.training_epochs = training_epochs self.batch_size = batch_size if display_step == -1: display_step = int(training_epochs / 26) self.display_step = display_step self.n_hidden_1 = n_hidden_1 # 1st layer num features self.n_hidden_2 = n_hidden_2 # 2nd layer num features y_train = pd.get_dummies(y_train).as_matrix() if y_test is not None: y_test = pd.get_dummies(y_test).as_matrix() self.n_input = x_train.shape[1] self.n_classes = y_train.shape[1] self.x = tf.placeholder(tf.float32, [None, self.n_input]) self.y = tf.placeholder(tf.float32, [None, self.n_classes]) self.x_train = x_train self.y_train = y_train self.x_test = x_test self.y_test = y_test
def prepare_data(original_data):
    original_data['TotalIncome'] = original_data['ApplicantIncome'] + original_data['CoapplicantIncome']
    original_data['TotalIncome_log'] = np.log(original_data['TotalIncome'])
    original_data['LoanAmount_log'] = np.log(original_data['LoanAmount'])
    one_hot_encoding_data = pd.concat([pd.get_dummies(original_data['Gender']),
                                       pd.get_dummies(original_data['Married'], prefix="Married"),
                                       original_data['Dependents'],
                                       pd.get_dummies(original_data['Education'], prefix="Education"),
                                       pd.get_dummies(original_data['Self_Employed'], prefix="Self_Employed"),
                                       original_data['TotalIncome_log'],
                                       original_data['LoanAmount_log'],
                                       original_data['Loan_Amount_Term'],
                                       original_data['Credit_History'],
                                       pd.get_dummies(original_data['Property_Area'], prefix="Property_Area")],
                                      axis=1)
    # optionally drop one dummy per group to avoid collinearity, e.g.:
    # one_hot_encoding_data.drop('Female', 1, inplace=True)
    # one_hot_encoding_data.drop('Married_Yes', 1, inplace=True)
    # replace the '3+' category of Dependents with a numeric value
    one_hot_encoding_data.loc[one_hot_encoding_data.Dependents == '3+', 'Dependents'] = 5
    median_features = one_hot_encoding_data.dropna().median()
    print(median_features)
    imputed_features = one_hot_encoding_data.fillna(median_features)
    return imputed_features
def predictions_sm(weather_turnstile):
    # Your implementation goes here. Feel free to write additional helper functions.
    values = weather_turnstile['ENTRIESn_hourly']

    # get the weekday name
    weather_turnstile['DATEn'] = pd.to_datetime(weather_turnstile['DATEn'])
    weather_turnstile['weekend'] = weather_turnstile['DATEn'].dt.dayofweek
    days = {0: 'Mon', 1: 'Tues', 2: 'Weds', 3: 'Thurs', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
    weather_turnstile['weekend'] = weather_turnstile['weekend'].apply(lambda x: days[x])

    features = weather_turnstile[['maxpressurei', 'maxdewpti', 'mindewpti', 'minpressurei',
                                  'meandewpti', 'precipi', 'fog', 'rain', 'meanwindspdi',
                                  'mintempi', 'meantempi', 'maxtempi', 'meanpressurei']]

    # dummy variables for UNIT, weekday and Hour
    dummy_units = pd.get_dummies(weather_turnstile['UNIT'], prefix='unit')
    features = features.join(dummy_units)
    dummy_units = pd.get_dummies(weather_turnstile['weekend'], prefix='day')
    features = features.join(dummy_units)
    dummy_units = pd.get_dummies(weather_turnstile['Hour'], prefix='hour')
    features = features.join(dummy_units)

    features, mu, sigma = normalize_features(features)
    features = sm.add_constant(features)

    # train, fit and predict the model
    model = sm.OLS(values, features)
    results = model.fit()
    prediction = model.predict(results.params, features)
    return prediction
def test_include_na(self, sparse, dtype): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=sparse, dtype=dtype) exp = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0]}, dtype=self.effective_dtype(dtype)) if sparse: exp = exp.apply(pd.SparseArray, fill_value=0.0) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype) exp_na = DataFrame({nan: [0, 0, 1], 'a': [1, 0, 0], 'b': [0, 1, 0]}, dtype=self.effective_dtype(dtype)) exp_na = exp_na.reindex(['a', 'b', nan], axis=1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns if sparse: exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0) assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([nan], dummy_na=True, sparse=sparse, dtype=dtype) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], dtype=self.effective_dtype(dtype)) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def binarize_data(self): fn = os.path.join(DATA_DIR, 'all_data_nn.csv') df = pd.read_csv(fn) df['norm_age'] = (df['age_at_ins'] - min(df['age_at_ins']))/(max(df['age_at_ins']) - min(df['age_at_ins'])) print df.columns # categorical features cat_features = ['make', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'nvcat'] # binarize categorical features binarized_df1 = pd.get_dummies(df[cat_features]) binarized_df2 = pd.get_dummies(df['ordcat']) binarized_df2.columns = ['ordcat_1', 'ordcat_2', 'ordcat_3', 'ordcat_4', 'ordcat_5', 'ordcat_6', 'ordcat_7'] binarized_df = pd.concat([df[['rowid', 'var4', 'var5', 'var7', 'nvvar1', 'nvvar2', 'nvvar3', 'nvvar4', 'response', 'ind', 'norm_age']], binarized_df1, binarized_df2], axis=1) # remove columns such that there n-1 features for a caterical variable with n values rem_list = ['make_Z', 'cat1_G', 'cat2_C', 'cat3_F', 'cat4_C', 'cat5_C', 'cat6_F', 'cat7_D', 'cat8_C', 'cat9_B', 'cat10_C', 'cat11_F', 'cat12_F', 'nvcat_O', 'ordcat_7'] binarized_df = binarized_df.drop(rem_list, axis=1) binarized_df.to_csv('all_data_nn_binarized.csv', index=False, index_label=False)
def transform(self, X_df): X_encoded = X_df #uncomment the line below in the submission path = os.path.dirname(__file__) X_encoded = X_df X_encoded = X_encoded.join(pd.get_dummies(X_encoded['Departure'], prefix='d')) X_encoded = X_encoded.join(pd.get_dummies(X_encoded['Arrival'], prefix='a')) X_encoded = X_encoded.drop('Departure', axis=1) X_encoded = X_encoded.drop('Arrival', axis=1) #data_holidays = pd.read_csv("data_holidays_2.csv") data_holidays = pd.read_csv(os.path.join(path, "data_holidays_2.csv")) X_holidays = data_holidays[['DateOfDeparture','Xmas','Xmas-1','NYD','NYD-1','Ind','Thg','Thg+1','Lab','Mem']] X_encoded = X_encoded.merge(X_holidays, how='left', left_on=['DateOfDeparture'], right_on=['DateOfDeparture'], sort=False) X_encoded['DateOfDeparture'] = pd.to_datetime(X_encoded['DateOfDeparture']) X_encoded['year'] = X_encoded['DateOfDeparture'].dt.year X_encoded['weekday'] = X_encoded['DateOfDeparture'].dt.weekday X_encoded['week'] = X_encoded['DateOfDeparture'].dt.week X_encoded = X_encoded.join(pd.get_dummies(X_encoded['year'], prefix='y')) X_encoded = X_encoded.join(pd.get_dummies(X_encoded['weekday'], prefix='wd')) X_encoded = X_encoded.join(pd.get_dummies(X_encoded['week'], prefix='w')) X_encoded = X_encoded.drop('weekday', axis=1) X_encoded = X_encoded.drop('week', axis=1) X_encoded = X_encoded.drop('year', axis=1) X_encoded = X_encoded.drop('std_wtd', axis=1) X_encoded = X_encoded.drop('WeeksToDeparture', axis=1) X_encoded = X_encoded.drop('DateOfDeparture', axis=1) X_array = X_encoded.values return X_array
def dataPreprocessing(filename):
    data_train = pd.read_csv(filename)
    data_train, rfr = set_missing_ages(data_train)
    data_train = set_Cabin_type(data_train)
    # one-hot encode the categorical columns
    dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix='Cabin')
    dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
    dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')
    dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
    # drop the original Cabin, Embarked, Pclass and Sex columns
    data_train.drop(['Cabin', 'Embarked', 'Pclass', 'Sex'], axis=1, inplace=True)
    # build the new DataFrame
    df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Pclass, dummies_Sex], axis=1)
    # standardize Age and Fare with sklearn's preprocessing.StandardScaler
    ps = preprocessing.StandardScaler()
    df['Age_scaled'] = ps.fit_transform(df[['Age']]).ravel()
    df['Fare_scaled'] = ps.fit_transform(df[['Fare']]).ravel()
    return df, rfr
def get_votes_data(votes_df): """creates dummies, converts dates, and gets counts for votes""" votes_df["date"] = pd.to_datetime(votes_df.date) votes_df["num_yes"] = votes_df.votes.map(lambda x: len(x.get("Yea", x.get("Aye", [])))) votes_df["num_no"] = votes_df.votes.map(lambda x: len(x.get("No", x.get("Nay", [])))) votes_df["num_not_voting"] = votes_df.votes.map(lambda x: len(x.get("Not Voting", []))) votes_df["num_present"] = votes_df.votes.map(lambda x: len(x.get("Present", []))) votes_df["percent_yes_D"] = votes_df.votes.map( lambda x: get_precent_party(x.get("Yea", x.get("Aye", [])))["countD"] ) votes_df["percent_no_D"] = votes_df.votes.map(lambda x: get_precent_party(x.get("No", x.get("Nay", [])))["countD"]) votes_df["percent_yes_R"] = votes_df.votes.map( lambda x: get_precent_party(x.get("Yea", x.get("Aye", [])))["countR"] ) votes_df["percent_no_R"] = votes_df.votes.map(lambda x: get_precent_party(x.get("No", x.get("Nay", [])))["countR"]) votes_df["percent_not_voting_D"] = votes_df.votes.map( lambda x: get_precent_party(x.get("Not Voting", []))["countD"] ) votes_df["percent_not_voting_R"] = votes_df.votes.map( lambda x: get_precent_party(x.get("Not Voting", []))["countR"] ) votes_df["percent_present_D"] = votes_df.votes.map(lambda x: get_precent_party(x.get("Present", []))["countD"]) votes_df["percent_present_R"] = votes_df.votes.map(lambda x: get_precent_party(x.get("Present", []))["countR"]) votes_df["is_amendment"] = votes_df.amendment.notnull() votes_df = pd.concat([votes_df, pd.get_dummies(votes_df.category)], axis=1) votes_df.drop("unknown", axis=1, inplace=True) votes_df = pd.concat([votes_df, pd.get_dummies(votes_df.requires)], axis=1) votes_df.drop("3/5", axis=1, inplace=True) votes_df = pd.concat([votes_df, pd.get_dummies(votes_df.session)], axis=1) votes_df.drop("2002", axis=1, inplace=True) return votes_df
def model1(title): df = pd.read_csv('./data/train.csv') df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1) df['Gender']= df['Sex'].map({'female':0, 'male': 1}).astype(int) age_mean = df['Age'].mean() mode_embarked = mode(df['Embarked'])[0][0] df['Embarked'] = df['Embarked'].fillna(mode_embarked) df['Age'] = df['Age'].fillna(age_mean) df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1) df = df.drop(['Sex', 'Embarked'], axis=1) cols = df.columns.tolist() cols = [cols[1]] + cols[0:1] + cols[2:] df = df[cols] train_data = df.values #rf(train_data[0:, 2:], train_data[0:,0], train_data[0:, 2:], train_data[0:,0]) #model = RandomForestClassifier(n_estimators=100) #model = model.fit(train_data[0:, 2:], train_data[0:,0]) df_test = pd.read_csv('./data/test.csv') df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1) df_test['Gender']= df_test['Sex'].map({'female':0, 'male': 1}).astype(int) age_mean = df_test['Age'].mean() df_test['Age'] = df_test['Age'].fillna(age_mean) fare_means = df.pivot_table('Fare', index='Pclass', aggfunc='mean') df_test['Fare'] = df_test[['Fare', 'Pclass']].apply(lambda x: fare_means[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1) df_test = pd.concat([df_test, pd.get_dummies(df_test['Embarked'], prefix='Embarked')], axis=1) df_test = df_test.drop(['Sex', 'Embarked'], axis=1) test_data = df_test.values if title.rf: rf(train_data[0:, 2:], train_data[0:,0], train_data[0:, 2:], train_data[0:,0]) if title.mlp: nn(train_data[0:, 2:], train_data[0:,0], train_data[0:, 2:], train_data[0:,0])
def preprocess_data(data): data['Title'] = data['Name'].apply(get_title) data=data.drop('Name', axis=1) data=data.drop('Cabin', axis=1) data=data.drop('Ticket', axis=1) # data['Age'].fillna(data['Age'].mean(), inplace=True) process_age(data) data['Age'].fillna(data['Age'].mean(), inplace=True) print (data.info()) data['Fare'].fillna(data['Fare'].mean(), inplace=True) data['Embarked'].fillna('S', inplace=True) gender_dummy=pd.get_dummies(data['Sex']) data=pd.concat([data, gender_dummy], axis=1) data=data.drop('Sex', axis=1) data=data.drop('Title', axis=1) gender_dummy=pd.get_dummies(data['Embarked']) data=pd.concat([data, gender_dummy], axis=1) data=data.drop('Embarked', axis=1) return data
def transform(self, X_df): X_encoded = X_df # uncomment the line below in the submission # path = os.path.dirname(__file__) X_encoded = X_df X_encoded = X_encoded.join(pd.get_dummies(X_encoded["Departure"], prefix="d")) X_encoded = X_encoded.join(pd.get_dummies(X_encoded["Arrival"], prefix="a")) X_encoded = X_encoded.drop("Departure", axis=1) X_encoded = X_encoded.drop("Arrival", axis=1) # data_holidays = pd.read_csv(os.path.join(path, "data_holidays.csv")) # X_holidays = data_holidays[['DateOfDeparture','Xmas','Xmas-1','NYD','NYD-1','Ind','Thg','Thg+1']] # X_encoded = X_encoded.set_index(['DateOfDeparture']) # X_holidays = X_holidays.set_index(['DateOfDeparture']) # X_encoded = X_encoded.join(X_holidays).reset_index() X_encoded["DateOfDeparture"] = pd.to_datetime(X_encoded["DateOfDeparture"]) X_encoded["year"] = X_encoded["DateOfDeparture"].dt.year X_encoded["weekday"] = X_encoded["DateOfDeparture"].dt.weekday X_encoded["week"] = X_encoded["DateOfDeparture"].dt.week X_encoded = X_encoded.join(pd.get_dummies(X_encoded["year"], prefix="y")) X_encoded = X_encoded.join(pd.get_dummies(X_encoded["weekday"], prefix="wd")) X_encoded = X_encoded.join(pd.get_dummies(X_encoded["week"], prefix="w")) X_encoded = X_encoded.drop("weekday", axis=1) X_encoded = X_encoded.drop("week", axis=1) X_encoded = X_encoded.drop("year", axis=1) X_encoded = X_encoded.drop("std_wtd", axis=1) X_encoded = X_encoded.drop("WeeksToDeparture", axis=1) X_encoded = X_encoded.drop("DateOfDeparture", axis=1) X_array = X_encoded.values return X_array
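# The three transform() variants above all use X_encoded['DateOfDeparture'].dt.week,
# which was deprecated in pandas 1.1 and later removed. A small sketch of the
# replacement (assuming a recent pandas; the sample dates are illustrative only):
import pandas as pd

dates = pd.to_datetime(pd.Series(["2012-06-19", "2012-09-10"]))
# Series.dt.isocalendar() returns a DataFrame with year/week/day columns
week = dates.dt.isocalendar().week.astype(int)
print(week.tolist())  # [25, 37]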
def gridtrainfraction(trainiter, rfparams): ''' read in data once for grid search, clear, then again for model fit''' train = fractionate(trainiter, fraction=0.002) clf = RandomForestClassifier(**rfparams) grid = GridSearchCV(clf, param_grid=gridparams, scoring='log_loss', n_jobs=1) X_train = train.drop('hotel_cluster', axis=1) X = sparsify(pd.get_dummies(X_train.astype(str))) y = train['hotel_cluster'] grid.fit(X,y) print(grid.best_params_) print(grid.grid_scores_) train = None X_train = None X = None y = None clf = None train = fractionate(trainiter, fraction=0.01) X_train = train.drop('hotel_cluster', axis=1) X = sparsify(pd.get_dummies(X_train.astype(str))) y = train['hotel_cluster'] bestparams = grid.best_params_ clf = RandomForestClassifier(**rfparams) clf.set_params(**bestparams) clf.fit(X,y) return clf
def predictions(dataframe):
    features = dataframe[['meantempi']]
    dummy_unit = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    dummy_hour = pd.get_dummies(dataframe['Hour'], prefix='hour')
    # isolate the day of the week for a dummy variable
    date_fn_input = dataframe['DATEn'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    day_of_wk = date_fn_input.apply(lambda x: datetime.strftime(x, '%w'))
    dummy_day_of_wk = pd.get_dummies(day_of_wk, prefix='day_of_wk')
    features = features.join(dummy_unit).join(dummy_hour).join(dummy_day_of_wk)

    values = dataframe['ENTRIESn_hourly']
    features_array = features.values
    values_array = values.values
    print(pd.DataFrame(features_array).head())

    means, std_devs, normalized_features_array = normalize_features(features_array)

    # Perform stochastic gradient descent on the normalized features
    norm_intercept, norm_params = linear_regression_SGD(normalized_features_array, values_array)
    intercept, params = recover_params(means, std_devs, norm_intercept, norm_params)
    predictions_SGD = intercept + np.dot(features_array, params)
    # The following line would be equivalent:
    # predictions_SGD = norm_intercept + np.dot(normalized_features_array, norm_params)
    return predictions_SGD
def _addFeatures(self, df): #Add additional features to train/test dataframe if len(df)>0: df = df.reset_index() df['hour'] = df.apply(lambda row: row['pick_date'].hour, axis=1) df['min'] = df.apply(lambda row: row['pick_date'].hour * 60 + row['pick_date'].minute, axis=1) df['weekend'] = df.apply(lambda row: f_is_weekend(row['pick_date']), axis=1) #binary feature tod = df.apply(lambda row: f_tod(row['pick_date']), axis=1) #morning, midday, afternoon, envening, night weekday = df.apply(lambda row: row['pick_date'].weekday(), axis=1) #0-6 #dummify categorical features (drop one dummy and append the rest to the resulting dataframe) #append referent values to interim data to make sure all columns are present in the output (even if dataset does not #contain all values in tod/weekday column) ref_vals = ['morning', 'midday', 'afternoon', 'evening', 'night'] tod = pd.Series(list(itertools.chain(tod, ref_vals))) tod = pd.get_dummies(tod, prefix = 'tod') df = df.join(tod.ix[:len(df), ['tod_'+i for i in ref_vals[1:]]]) ref_vals = range(0,7) weekday = pd.Series(list(itertools.chain(weekday, ref_vals))) weekday = pd.get_dummies(weekday, prefix = 'weekday') df = df.join(weekday.ix[:len(df), ['weekday_'+str(i) for i in ref_vals[1:]]]) return df
def Onehot_Encoding_DD(New_DS, Train_DS, y): #cos_dist_T = Get_similarity_matrix(Train_DS,y) cos_dist_T = Get_similarity_matrix_DD2(New_DS) #one hot encoding for DepartmentDescription print("one hot encoding sales - DepartmentDescription at Time: %s" %(tm.strftime("%H:%M:%S"))) dummies = pd.get_dummies(New_DS['DepartmentDescription']) DeptDesc_cols = [ 'DD'+"_buy1_"+str(s) for s in list(dummies.columns)] sim_dd_buy = cos_dist_T sim_dd_buy.columns = DeptDesc_cols sim_dd_buy = sim_dd_buy.reset_index() cols = ['VisitNumber','ScanCount','DepartmentDescription'] New_DS = New_DS[cols].merge(sim_dd_buy,left_on='DepartmentDescription',right_on='index',how='left') New_DS = New_DS.drop(['index'], axis = 1) #get "buying" qty for DepartmentDescription Temp_Scan = pd.DataFrame() Temp_Scan['ScanCount'] = New_DS ['ScanCount'] Temp_Scan['ScanCount'] = np.where(New_DS ['ScanCount']>= 0,New_DS ['ScanCount'],0).astype(int) for i in range(len(DeptDesc_cols)): New_DS[DeptDesc_cols[i]] = New_DS[DeptDesc_cols[i]] * Temp_Scan ['ScanCount'] del sim_dd_buy ##----------------------------------------------------------------------------------------------------------------## print("one hot encoding return - DepartmentDescription at Time: %s" %(tm.strftime("%H:%M:%S"))) #one hot encoding for DepartmentDescription - Return dummies = pd.get_dummies(New_DS['DepartmentDescription']) DeptDesc_cols = [ 'DD'+"_ret1_"+str(s) for s in list(dummies.columns)] sim_dd_ret = cos_dist_T sim_dd_ret.columns = DeptDesc_cols sim_dd_ret = sim_dd_ret.reset_index() New_DS = New_DS.merge(sim_dd_ret,left_on='DepartmentDescription',right_on='index',how='left') New_DS = New_DS.drop(['index'], axis = 1) #get "return" qty for DepartmentDescription Temp_Scan['ScanCount'] = New_DS ['ScanCount'] Temp_Scan['ScanCount'] = np.where(New_DS ['ScanCount'] < 0,New_DS ['ScanCount']*-1,0).astype(int) for i in range(len(DeptDesc_cols)): New_DS[DeptDesc_cols[i]] = New_DS[DeptDesc_cols[i]] * Temp_Scan ['ScanCount'] del sim_dd_ret ##----------------------------------------------------------------------------------------------------------------## New_DS = New_DS.drop(['ScanCount','DepartmentDescription'], axis = 1) New_DS = New_DS.groupby('VisitNumber').sum().reset_index() print(np.shape(New_DS)) #pd.DataFrame(New_DS).to_csv(file_path+'New_DS.csv') return New_DS
def transform_big(data): data = data.copy() shot_type = pd.get_dummies(data["Shot Type"].apply(shot)) data["Shot Dist."] = data["Shot Dist."].apply(lambda x : x.replace("ft.", "")) data["Shot Dist."] = data["Shot Dist."].apply(lambda x : 0 if x== "" else float(x)) # shot_clock = data["Shot Clock"].apply(lambda x: 0 if x == "" else float(x)) # touch_time = data["Touch Time"].apply(lambda x: float(x)) # drib = data["Drib."].apply(lambda x: int(x)) data["Def Dist."] = data["Def Dist."].apply(lambda x: float(x)) # def_dist_c = pd.get_dummies(data["Def Dist."].apply(def_dist)) player_c = pd.get_dummies(data["Player"]) shot_dist_c = pd.get_dummies(data["Shot Dist."].apply(shot_dist)) con = [player_c, shot_type , data["Def Dist."], shot_dist_c, data["Shot Dist."],(data["Made?"]=="Yes").astype(int)] # con = [player_c, shot_type, shot_clock, touch_time, drib, # shot_dist_c, data["Shot Dist."],(data["Made?"]=="Yes").astype(int)] new_shot_chart = pd.concat(con , axis=1) pred = player_c.columns[:len(player_c)].tolist()+ ['Shot Dist.', 'Def Dist.', 'else', 'jump', 'layup', 'Made?'] return new_shot_chart[pred]
def trans2vect(data): item_vec = data.reindex(columns=orin_name) # dummy capsule = pd.get_dummies(data.CAPSULE_TEXT, prefix='cap_') genre = pd.get_dummies(data.GENRE_NAME, prefix='gen_') large_area = pd.get_dummies(data.large_area_name, prefix='larg_area_') ken_name = pd.get_dummies(data.ken_name, prefix='ken_') small_name = pd.get_dummies(data.small_area_name, prefix='small_area_') # time dispfrom = pd.to_datetime(data.DISPFROM) item_vec['dispfrom'] = [x.dayofyear for x in dispfrom] dispend = pd.to_datetime(data.DISPEND) item_vec['dispend'] = [x.dayofyear for x in dispend] validfrom = pd.to_datetime(data.VALIDFROM) item_vec['validfrom'] = [x.dayofyear for x in validfrom] validend = pd.to_datetime(data.VALIDEND) item_vec['validend'] = [x.dayofyear for x in validend] # join item_vec = item_vec.join([capsule, genre, large_area, ken_name, small_name]) item_vec.index = data.COUPON_ID_hash item_vec = item_vec.fillna(0) # feature engineering item_vec.DISCOUNT_PRICE = 1 / np.log10(item_vec.DISCOUNT_PRICE) item_vec.CATALOG_PRICE = 1 / np.log10(item_vec.CATALOG_PRICE) item_vec.PRICE_RATE = (item_vec.PRICE_RATE ** 2) / (100 * 100) scale_name = [u'DISPPERIOD', u'VALIDPERIOD',u'dispfrom', u'dispend', u'validfrom', u'validend'] for i in scale_name: item_vec[i] = scale(item_vec[i]) return item_vec
def get_data_frame_with_dummies(users): users_ref = users.copy() base_dummies = None categories = {'gender': ['male', 'female'], 'education': ['overGraduate', 'university', 'underHigh'], 'income': ['100', '200', '300', '400', '500', '1200more'], 'job': ['officer', 'student', 'etc'], 'marriage': ['married', 'single'], 'religion': ['buddhist', 'none', 'christian', 'romanCatholicism']} age_bins = [10, 20, 30, 40, 50, 60, 70] numChild_bins = [0, 1, 10] for label_type in users_ref.columns: temp_dummies = None if label_type == 'age': temp_dummies = pd.get_dummies(pd.cut(users_ref[label_type], age_bins, right=False), prefix=label_type) elif label_type == 'numberOfChildren': temp_dummies = pd.get_dummies(pd.cut(users_ref[label_type], numChild_bins, right=False), prefix=label_type) elif label_type == 'residence': continue else: users_ref[label_type + "_cat"] = pd.Categorical(users_ref[label_type], categories=categories.get(label_type)) temp_dummies = pd.get_dummies(users_ref[label_type + "_cat"], prefix=label_type) if base_dummies is None: base_dummies = temp_dummies else: base_dummies = pd.concat([base_dummies, temp_dummies], axis=1) label_nums = base_dummies.sum() label_rates = label_nums / float(len(users_ref)) return base_dummies, label_nums, label_rates
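# For reference, pd.get_dummies composes directly with pd.cut, as in the age binning
# above; a tiny standalone sketch (illustrative values only):
import pandas as pd

ages = pd.Series([23, 35, 47, 61])
age_bins = [10, 20, 30, 40, 50, 60, 70]
# right=False gives left-closed intervals such as [20, 30), matching the code above
age_dummies = pd.get_dummies(pd.cut(ages, age_bins, right=False), prefix='age')
print(list(age_dummies.columns))  # ['age_[10, 20)', 'age_[20, 30)', ..., 'age_[60, 70)']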
def procc_testset(clf, age_scale_param, fare_scale_param):
    import sklearn.preprocessing as preprocessing
    import numpy as np
    scaler = preprocessing.StandardScaler()
    data_test = pd.read_csv("test.csv")
    data_test.loc[(data_test.Fare.isnull()), 'Fare'] = 0
    # apply the same feature transformations to the test data as to the training data:
    # first fill in the missing ages with the same RandomForestRegressor model (global rfr)
    tmp_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    null_age = tmp_df[data_test.Age.isnull()].values
    # predict the missing ages from the feature columns and fill them in
    X = null_age[:, 1:]
    predictedAges = rfr.predict(X)
    data_test.loc[(data_test.Age.isnull()), 'Age'] = predictedAges

    data_test = set_Cabin_type(data_test)
    dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
    dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')
    df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
    df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

    # scale Age and Fare (ideally the scaler fitted on the training set would be reused)
    df_test['Age_scaled'] = scaler.fit_transform(df_test[['Age']]).ravel()
    df_test['Fare_scaled'] = scaler.fit_transform(df_test[['Fare']]).ravel()
    df_test.head(8)

    test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    predictions = clf.predict(test)
    result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                           'Survived': predictions.astype(np.int32)})
    result.to_csv("logistic_regression_predictions.csv", index=False)
data['thalassemia'][data['thalassemia'] == 0] = 'reversable defect' x = data.iloc[:, 0:13].values x = pd.DataFrame(x) y = data.iloc[:, 13].values x.columns = [ 'age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved', 'exercise_induced_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia' ] x = pd.get_dummies(x, columns=[ 'sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg', 'exercise_induced_angina', 'st_slope', 'thalassemia' ], drop_first=True) from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0) #Feature scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() x_train = sc.fit_transform(x_train) x_test = sc.transform(x_test)
model.save('best_lstm_model.h5') del history del model gc.collect() """### Evaluation""" predY = np.average(submission_predictions, axis = 0, weights = [2**i for i in range(len(submission_predictions))]) # plot precision-recall-curve precision = dict() recall = dict() y_test_dummies = pd.get_dummies(testY, drop_first=False).values for i in range(3): precision[i], recall[i], _ = precision_recall_curve(y_test_dummies[:, i], predY[:, i]) plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i)) plt.xlabel("recall") plt.ylabel("precision") plt.legend(loc="best") plt.title("precision vs. recall curve") plt.show() plt.savefig('prc.png') # plot ROC import seaborn as sns class_to_label_map = ['normal', 'covid', 'pneumonia']
        else:
            return 24
    else:
        return Age


train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis=1)
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')
train.drop('Cabin', axis=1, inplace=True)
train.head()
train.dropna(inplace=True)
train.info()

sex = pd.get_dummies(train['Sex'], drop_first=True)
embark = pd.get_dummies(train['Embarked'], drop_first=True)
train.drop(['Sex', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)
train = pd.concat([train, sex, embark], axis=1)
train.head()

# building the logistic regression model
# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train.drop('Survived', axis=1),
                                                    train['Survived'],
                                                    test_size=0.30, random_state=101)
full.loc[full['build_year'] == 1, 'build_year'] = np.nan
full.loc[full['build_year'] == 20, 'build_year'] = 2000
full.loc[full['build_year'] == 215, 'build_year'] = 2015
full.loc[full['build_year'] == 3, 'build_year'] = np.nan
full.loc[full['build_year'] == 2, 'build_year'] = np.nan
full.loc[full['build_year'] == 71, 'build_year'] = np.nan
full.loc[full['build_year'] == 4965, 'build_year'] = np.nan

# re-partition sub_area
# full.loc[full['sub_area']=='']
full.drop(["id", "timestamp", "price_doc"], axis=1, inplace=True)

# 289 columns before get_dummies, 451 columns afterwards
full = pd.get_dummies(full, columns=col_object)

# hyper-parameter tuning
def get_model(estimator, parameters, X_train, y_train, scoring):
    model = GridSearchCV(estimator, param_grid=parameters, scoring=scoring)
    model.fit(X_train, y_train)
    return model.best_estimator_

# X = full[full.floor.notnull()].drop('floor', axis=1)
# y = full[full.floor.notnull()].floor
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2017)
# XGB = xgb.XGBRegressor(max_depth=4, seed=2017)
y = dataset[:, -1] from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) #print(pd.DataFrame(y_train).hist(bins=4)) pd.Series(y_train).value_counts(bins=4) pd.Series(y_test).value_counts(bins=4) from sklearn.preprocessing import StandardScaler sc_X = StandardScaler() X_train = sc_X.fit_transform(X_train) X_test = sc_X.transform(X_test) #one-hot encoded output classes y_train = pd.get_dummies(y_train).values y_test = pd.get_dummies(y_test).values #test1=test.values #X_test=test1[:,0:-1] #y_test=test1[:,-1] # Designing of the model from keras.callbacks import EarlyStopping from keras.models import Sequential from keras.layers import Dense, BatchNormalization, Dropout from keras.layers.embeddings import Embedding from keras.layers import Input from keras.models import Model from keras.optimizers import Adadelta, SGD, Adam, RMSprop input_img = Input(shape=(36, ))
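# In the block above y_train and y_test are one-hot encoded independently; if a class is
# missing from one split the two matrices end up with different widths. A minimal sketch
# (toy labels, not the original data) of aligning the test dummies to the training columns:
import pandas as pd

y_train = pd.Series([0, 1, 2, 1])
y_test = pd.Series([0, 1, 1])  # class 2 happens to be absent from the test split

train_oh = pd.get_dummies(y_train)
test_oh = pd.get_dummies(y_test).reindex(columns=train_oh.columns, fill_value=0)
print(train_oh.shape, test_oh.shape)  # (4, 3) (3, 3)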
def prepare_data(): # Copied wti200's kernel: from https://www.kaggle.com/wti200/deep-neural-network-for-starters-r excluded = get_excluded() df_train = pd.read_csv("../input/train.csv", parse_dates=['timestamp']) df_test = pd.read_csv("../input/test.csv", parse_dates=['timestamp']) #------------------------------------- # Note that the following is essential to get good performance: # You can produce these pkl by using the two kernels with 5 fold non-shuffle traning, as we usually did in stacking #------------------------------------- # https://www.kaggle.com/schoolpal/lgbm-lb-0-3093-0-3094 # https://www.kaggle.com/schoolpal/modifications-to-reynaldo-s-script # (xgb_train,xgb_test)=pickle.load(open('xgb_predicted.pkl')) # (xgb_train_log,xgb_test_log)=pickle.load(open('xgb_predicted_log.pkl')) # (lgb_train,lgb_test)=pickle.load(open('lgb_predicted.pkl')) # df_train['xgb_score']=xgb_train # df_train['log_xgb_score']=np.log(xgb_train) # df_train['lgb_score']=lgb_train # df_train['lgb_score_log']=np.log(lgb_train) ## df_train['log_xgb_score']=xgb_train_log ## df_train['log_xgb_score_log']=np.log(xgb_train_log) # df_test['xgb_score']=xgb_test # df_test['log_xgb_score']=np.log(xgb_test) # df_test['lgb_score']=lgb_test # df_test['lgb_score_log']=np.log(lgb_test) ## df_test['log_xgb_score']=xgb_test_log ## df_test['log_xgb_score_log']=np.log(xgb_test_log) # Magic number from Andy's script (Louis?) df_train['price_doc'] *= 0.969 full_sq = df_train.full_sq.copy() full_sq[full_sq < 5] = np.NaN price_sq = df_train.price_doc / full_sq #Remove the extreme prices, took from someone's kernel (sry) df_train = df_train[(price_sq < 600000) & (price_sq > 10000)] price_sq = price_sq[(price_sq < 600000) & (price_sq > 10000)] y_train = df_train.price_doc df_train.drop(['price_doc'], inplace=True, axis=1) num_train = df_train.shape[0] da = pd.concat([df_train, df_test]) da = da.reset_index(drop=True) ''' The feature enginering part, most of the FE were took from other peole's kernel. last_days method adds the mean of full_sq for all the house sold in last 30 days. This feature was motivated from my autoregression model for monthly prices. What does this feature capture? I tried daily sum of full_sq which clearly indicates the supply and demand. However, the local CV results of monthly price prediction actually prefer mean! I think this feature somehow captured the supply and demand for luxury or economic properties. ''' da = last_days(da) # These two features are only necessary as I removed the outlier feature values (> 4 SD) for all features, but these two are important to keep. da['build_year1'] = ((da['build_year'] == 1) & (da.product_type == 'OwnerOccupier')).astype(int) da['build_year0'] = ((da['build_year'] == 0) & (da.product_type == 'OwnerOccupier')).astype(int) # Fill some missing values based on location (Bhavesh Ghodasara's idea for # identify location) da = fill_years(da) da = fill_maxfloor(da) # Not necessary, I just fix it in order to calculate price per square meter for the sample weights da.loc[da['life_sq'] < 5, 'life_sq'] = np.NaN da.loc[da['full_sq'] < 5, 'full_sq'] = np.NaN # 0.7 come from the mean ratio (0.65?) 
between full_sq and life_sq,0.65 also works da['life_sq'] = np.where(da.life_sq.isnull(), da.full_sq * 0.7, da.life_sq) da['build_year'] = np.where( (da.build_year > 1690) & (da.build_year < 2020), da.build_year, np.NaN) da['max_floor'] = np.where(da.max_floor < da.floor, da.floor + 1, da.max_floor) da['material'] = da['material'].astype(str) da.loc[da.state == 33, 'state'] = 3 to_remove = [] product_types = pd.factorize(da.product_type)[0] product_types_string = da.product_type.copy() da['month'] = da.timestamp.dt.year.astype(str) # The year_month feature was added to nullify the effect of # "year_month" as I set the year_month of the test data to be NaN # I hope to nullify any effect of time. This is equivalent to say that we don't know the time for test data. # Any time effect must be learned from macro feature da['year_month'] = da.timestamp.dt.year da['year_month'] = (da['year_month'] * 100 + da.timestamp.dt.month) da.loc[da['year_month'] > 201506, 'year_month'] = np.NaN da['year_month'] = da['year_month'].astype(str) df_cat = None for c in da.columns: if da[c].dtype == 'object': oh = pd.get_dummies(da[c], prefix=c) if df_cat is None: df_cat = oh else: df_cat = pd.concat([df_cat, oh], axis=1) to_remove.append(c) da.drop(to_remove, inplace=True, axis=1) # Remove rare one hot encoded features to_remove = [] if df_cat is not None: sums = df_cat.sum(axis=0) to_remove = sums[sums < 200].index.values df_cat = df_cat.loc[:, df_cat.columns.difference(to_remove)] da = pd.concat([da, df_cat], axis=1) if excluded is not None: for c in excluded: if c in da.columns: da.drop([c], inplace=True, axis=1) # These additional features are taken from # https://www.kaggle.com/wti200/deep-neural-network-for-starters-r da['na_count'] = da.isnull().sum(axis=1) da['rel_floor'] = da.floor / da.max_floor da['diff_floor'] = da.max_floor - da.floor da['rel_kitchen_sq'] = da.kitch_sq - da.full_sq da['rel_life_sq'] = da.life_sq / da.full_sq da['rel_kitch_life'] = da.kitch_sq / da.life_sq da['rel_sq_per_floor'] = da.full_sq / da.floor da['diff_life_sq'] = da.full_sq - da.life_sq da['building_age'] = da.timestamp.dt.year - da.build_year da['new_house_own'] = ( (da['building_age'] <= 0) & (product_types_string == 'OwnerOccupier')).astype(int) da['old_house_own'] = ( (da['building_age'] > 0) & (product_types_string == 'OwnerOccupier')).astype(int) # Macro features, finally!!! # The unemployment info for 2016 was missing. So the unemployment rate were taken from OCED website # The original unemployment data is useful, but OCED's data is better (LB score) # These macro features are selected from my autoregresion time series model # for the monthly mean prices based on the local CV results. "eurrub" and "brent" for Investment properties, and "unemployment" for OwerOccupier. 
macro_cols = ['timestamp', 'brent', 'eurrub', 'unemployment'] macro = pd.read_csv('../input/macro.csv', parse_dates=['timestamp']) # Load the OCED unemployment # macro=macro_lib.fix(macro) macro = macro.loc[:, macro_cols] da = da.join(macro.set_index('timestamp'), on='timestamp') da[da == np.inf] = np.NaN if 'index' in da.columns: da.drop(['index'], inplace=True, axis=1) # Give tax-purpose properties a very low sample weights sample_weights = bad_weights(df_train, y_train, price_sq) train = da[:num_train].drop(['timestamp', 'id'], axis=1) test = da[num_train:].drop(['timestamp', 'id'], axis=1) # identify the binary features for excluding them from scaling bin_inds = [] for c in train.columns: if train.loc[:, c].unique().shape[0] == 2 and train.loc[:, c].unique( ).sum() == 1: bin_inds.append(train.columns.get_loc(c)) return train, test, y_train, da[num_train:].id, bin_inds, sample_weights
def basic_preprocess(train_complete, test_complete, out_column, drop_columns=None, forced_categorical = None, forced_numeric = None, columns_to_normalize = None, use_labeler = None, manual_processing = None, seed=42, perc=10): complete_features = pd.concat([train_complete, test_complete], sort=False).reset_index(drop=True) train = train_complete.copy() test = test_complete.copy() normalize_output = columns_to_normalize and out_column in columns_to_normalize if normalize_output: columns_to_normalize.remove(out_column) if use_labeler: if not columns_to_normalize: columns_to_normalize = [] for column in use_labeler: if column in columns_to_normalize: columns_to_normalize.remove(column) convert_dict = {} if forced_categorical: for column in forced_categorical: convert_dict[column] = 'str' if forced_numeric: for column in forced_numeric: convert_dict[column] = 'float64' train = train.astype(convert_dict) test = test.astype(convert_dict) if drop_columns: train.drop(drop_columns, axis=1, inplace=True) test.drop(drop_columns, axis=1, inplace=True) train_data = np.array(train[out_column]) if normalize_output: normalize, denormalize = transform_distribution(train_data) else: normalize = lambda x: x denormalize = lambda x: x y = np.array(normalize(train_data)) train_features = train.drop([out_column], axis=1) features = pd.concat([train_features, test], sort=False).reset_index(drop=True) impute_with_mode(features) numerics = list(features.select_dtypes(include=[np.number]).columns.values) if len(numerics) >= 2: imp = IterativeImputer(max_iter=10, sample_posterior=False, random_state=seed) imp.fit(features[numerics]) features[numerics] = imp.transform(features[numerics]) elif numerics: impute_with_median(features) if use_labeler: labeler = LabelEncoder() for column in use_labeler: features[column] = labeler.fit_transform(features[column]) final_features = pd.get_dummies(features).reset_index(drop=True) if columns_to_normalize: normalize_columns(final_features, columns_to_normalize) if manual_processing: final_features = manual_processing(final_features, complete_features) X = final_features.iloc[:len(y), :] X_sub = final_features.iloc[len(X):, :] #print('selecting relevant features') #X, X_sub = select_features(X, y, X_sub, final_features.columns, perc=perc) return X, y, X_sub, denormalize
# Box Cox Transformation of (highly) skewed features # We use the scipy function boxcox1p which computes the Box-Cox transformation of 1+x . # Note that setting λ=0 is equivalent to log1p used above for the target variable. skewness = skewness[abs(skewness) > 0.75] print("There are {} skewed numerical features to Box Cox transform".format( skewness.shape[0])) from scipy.special import boxcox1p skewed_features = skewness.index lam = 0.15 for feat in skewed_features: # all_data[feat] += 1 all_data[feat] = boxcox1p(all_data[feat], lam) # Getting dummy categorical features all_data = pd.get_dummies(all_data) print(all_data.shape) # Getting the new train and test sets. train = all_data[:ntrain] test = all_data[ntrain:] #Validation function n_folds = 5 def rmsle_cv(model): kf = KFold( n_folds, shuffle=True, random_state=42).get_n_splits(train.values) rmse = np.sqrt(-cross_val_score( model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf)) print("rmse", rmse)
for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session train_data = pd.read_csv("C:/01_Projects/09_CriticalFormulasandTools/PythonScripts/TitanicData/train.csv") train_data.head() test_data = pd.read_csv("C:/01_Projects/09_CriticalFormulasandTools/PythonScripts/TitanicData/test.csv") test_data.head() y = train_data["Survived"] features = ["Pclass", "Sex", "Fare", "Age"] X = pd.get_dummies(train_data[features]) X_test = pd.get_dummies(test_data[features]) from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import GaussianNB from sklearn.neural_network import MLPClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import confusion_matrix from sklearn.impute import SimpleImputer my_imputer = SimpleImputer() X = my_imputer.fit_transform(X) X_test = my_imputer.fit_transform(X_test) model1 = GaussianNB() model1.fit(X, y) model2 = RandomForestClassifier(max_depth=15, n_estimators=100, bootstrap=False, max_features= 'sqrt', min_samples_leaf=4, min_samples_split=10)
# Cat conversion for c in train.columns: if train[c].dtype == 'object': lbl = LabelEncoder() lbl.fit(list(train[c].values) + list(test[c].values)) train[c] = lbl.transform(list(train[c].values)) test[c] = lbl.transform(list(test[c].values)) y=train['y'] train.drop(['y'],inplace=True,axis=1) combine=pd.concat([train,test]) columns=['X1','X2','X3','X4','X5','X6','X8'] for column in columns: temp=pd.get_dummies(pd.Series(combine[column])) combine=pd.concat([combine,temp],axis=1) combine= combine.drop([column], axis=1) # Define some useful functions train=combine[:train.shape[0]] test=combine[train.shape[0]:] def df_column_uniquify(df): df_columns = df.columns new_columns = [] for item in df_columns: counter = 0 newitem = item
test_path = '~/Downloads/hacker_rank/Dataset/Test.csv' raw_test_df = pd.read_csv(test_path) index_column = 'Employee_ID' train_index = raw_train_df.pop(index_column) test_index = raw_test_df.pop(index_column) # Merging both train and test df = pd.concat([raw_train_df, raw_test_df], ignore_index=True) # Categorical column categorical_columns = [ 'Gender', 'Relationship_Status', 'Hometown', 'Unit', 'Decision_skill_possess', 'Compensation_and_Benefits' ] df = pd.get_dummies(df, columns=categorical_columns) # print(df.isna().sum()) # Imputation for Time_of_service # Time_of_service is related to Time_since_promotion ptable = df.pivot_table(values='Time_of_service', index='Time_since_promotion', aggfunc=np.mean) def get_element(x): index = int(x['Time_since_promotion']) return ptable.loc[index].values[0] df['Time_of_service'].fillna(df[df['Time_of_service'].isnull()].apply(
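# The script above concatenates train and test before calling get_dummies, which is one
# way to guarantee both parts share the same dummy columns. A self-contained sketch of
# the pattern, including splitting the encoded frame back apart (toy data, not the
# hackathon dataset):
import pandas as pd

train = pd.DataFrame({'Unit': ['IT', 'HR'], 'Age': [30, 41]})
test = pd.DataFrame({'Unit': ['R&D'], 'Age': [27]})

combined = pd.concat([train, test], ignore_index=True)
encoded = pd.get_dummies(combined, columns=['Unit'])

# split back by the original row count; both halves now have identical columns
train_enc = encoded.iloc[:len(train)].reset_index(drop=True)
test_enc = encoded.iloc[len(train):].reset_index(drop=True)
print(list(test_enc.columns))  # ['Age', 'Unit_HR', 'Unit_IT', 'Unit_R&D']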
def display_stacked_cat_bar(df, groupby, on, order=None, unit=None, palette=None, horizontal=True, figsize=(11, 11)): """ Displays a stacked bar plot given two categorical variables :param df: DataFrame to display data from :param groupby: Column name by which bars would be grouped :param on: Column name of the different bar blocks :param order: Order in which to draw the bars by :param unit: Scale to which unit :param palette: Color palette to use for drawing :param horizontal: Horizontal or vertical barplot :param figsize: Figure size :return: matplotlib.Axis object """ # Create a binary dataframe stacked_bar_df = pd.concat([df[groupby], pd.get_dummies(df[on])], axis=1) bins = list(stacked_bar_df.columns[1:]) stacked_bar_df = stacked_bar_df.groupby(groupby)[bins].sum().reset_index() if order: if not isinstance(order, list): raise ValueError('"order" must be a list') if set(order) != set(bins): raise ValueError( '"order" iterable must contain all possible values: {}'.format( str(bins))) stacked_bar_df = stacked_bar_df[[groupby] + order] bins = order # Scale if given unit if unit: # Calculate total stacked_bar_df['total'] = stacked_bar_df[bins].sum(axis=1) # Scale for bin_label in bins: stacked_bar_df[bin_label] /= stacked_bar_df['total'] stacked_bar_df[bin_label] *= unit # Drop irrelevant 'total' column stacked_bar_df = stacked_bar_df.iloc[:, :-1] # Cumsum row wise for idx in range(1, len(bins)): stacked_bar_df[bins[idx]] = stacked_bar_df[bins[idx]] + stacked_bar_df[ bins[idx - 1]] # Get relevant palette if palette: palette = palette[:len(bins)] else: palette = sns.color_palette()[:len(bins)] # Plot fig = plt.figure(figsize=figsize) ax = fig.add_subplot(111) if horizontal: for color, bin_label in reversed(list(zip(palette, bins))): sns.barplot(y=groupby, x=bin_label, data=stacked_bar_df, color=color, label=bin_label, ax=ax) else: for color, bin_label in reversed(list(zip(palette, bins))): sns.barplot(x=groupby, y=bin_label, data=stacked_bar_df, color=color, label=bin_label, ax=ax) ax.legend(bbox_to_anchor=(1.04, 1), loc='upper left') if unit: if horizontal: ax.set(xlim=(0, unit)) else: ax.set(ylim=(0, unit)) if horizontal: ax.set(xlabel='') else: ax.set(ylabel='') return ax
def cleanpy(cols, changetype, encodecol, scaling, scalingcol, targetcol, dftest, cleandatapath, rawdatapath): import pandas as pd import numpy as np from sklearn import preprocessing import os cols = cols changetype = changetype encodecol = encodecol scaling = scaling scalingcol = scalingcol targetcol = [targetcol] dftest = "" df = pd.read_csv(rawdatapath) #feature scaling if (scalingcol[0] != "none"): if scaling == 'standarization': for feature in scalingcol: df[feature] = (df[feature] - df[feature].mean()) / (df[feature].std()) else: x = df[scalingcol].values #returns a numpy array min_max_scaler = preprocessing.MinMaxScaler() x_scaled = min_max_scaler.fit_transform(x) df[scalingcol] = x_scaled #encoding le = preprocessing.LabelEncoder() if (encodecol[0] != "none"): if changetype == "labelencode": featurex = df[encodecol] featurex = featurex.apply(le.fit_transform) features = featurex.columns for feature in features: df.drop([feature], axis=1, inplace=True) df = pd.concat([df, featurex[feature]], axis=1) else: dummy = pd.get_dummies(df[encodecol]) df = pd.concat([df, dummy], axis=1) df.drop(encodecol, axis=1, inplace=True) if df[df[targetcol].columns[0]].dtype == object: featurex = df[targetcol] featurex = featurex.apply(le.fit_transform) features = featurex.columns for feature in features: df.drop([feature], axis=1, inplace=True) df = pd.concat([df, featurex[feature]], axis=1) # drop columns if (cols[0] != "none"): df = df.drop(cols, axis=1) # mandatory cleaning # removing rows having null values df.dropna(inplace=True) # try to convert all non-numeric values to numeric if possible df = df.infer_objects() # removing columns having object type values as it will create problem in model creation removecol = df.select_dtypes(include=['object']).columns df.drop(labels=removecol, axis=1, inplace=True) #test data creation if dftest == "": msk = np.random.rand(len(df)) < 0.75 dftrain = df[msk] dftest = df[~msk] else: dftrain = df #target variable seperation ytrain = pd.DataFrame(dftrain[targetcol]) ytest = pd.DataFrame(dftest[targetcol]) dftrain.drop(targetcol, axis=1, inplace=True) dftest.drop(targetcol, axis=1, inplace=True) dftrain.to_csv(cleandatapath + "dftrain.csv", index=None) dftest.to_csv(cleandatapath + "dftest.csv", index=None) ytrain.to_csv(cleandatapath + "ytrain.csv", index=None) ytest.to_csv(cleandatapath + "ytest.csv", index=None)
def read_compas(filename=os.path.join( conf.datadir, "compas-analysis/compas-scores-two-years.csv"), smlfeatures=False, return_all=False, single_S=False): #read compas dataset file (numeric ver) lines = [ line for line in open(filename, "r").readlines() if line.find("?") == -1 ] fo = open(filename, "w") for line in lines: fo.write(line) fo.close() #pd.set_option("display.max_rows", 100) #pd.set_option("display.max_colwidth", 100) #print dir(pd) data = pd.read_csv(filename, sep=',') int_values = [ "age", "juv_fel_count", "decile_score", "juv_misd_count", "juv_other_count", "v_decile_score", "priors_count" ] #,"is_recid" #string_values = ["sex","race","two_year_recid","c_charge_degree","c_charge_desc"] string_values = [ "sex", "two_year_recid", "type_of_assessment", "v_type_of_assessment" ] #,"r_charge_desc"] date_values = [ "c_jail_in", "c_jail_out", "c_offense_date", "screening_date", "in_custody", "out_custody" ] my_attrs = [] for int_val in int_values: my_attrs.append(data[int_val]) for string_val in string_values: my_attrs.append( pd.get_dummies(data[string_val], prefix=string_val, drop_first=True)) for date_val in date_values: temp = pd.to_datetime(data[date_val]) t_min, t_max = min(temp), max(temp) my_attrs.append((temp - t_min) / (t_max - t_min)) new_data = pd.concat(my_attrs, axis=1) new_data["African-American"] = (data["race"] == "African-American") new_data = new_data.dropna() if return_all: return new_data new_data.insert(0, "intercept", 1) corr_akey = [] for akey in new_data.keys(): corr_akey.append((np.corrcoef(new_data[akey], new_data["two_year_recid_1"])[0, 1], akey)) if single_S: S_keys = ["sex_Male"] else: S_keys = ["sex_Male", "African-American"] #race_Native American race_Asian race_Other race_Hispanic race_Caucasian S = np.transpose([list(new_data[i]) for i in S_keys]) #S = np.array(S, dtype=np.int_)*2-1 y = [v * 2.0 - 1.0 for v in new_data["two_year_recid_1"]] X_keys = set(new_data.keys()).difference([] + S_keys) X_keys_nonrace = set() for akey in X_keys: if akey.find("race") != 0: X_keys_nonrace.add(akey) X_keys = X_keys_nonrace print("X_keys=", len(X_keys), X_keys) #print list(race.keys()) #X2_keys = set() X2_keys = set(["intercept"]).intersection(X_keys) print("X2 keys=", X2_keys) X2 = np.transpose([list(new_data[i]) for i in X2_keys]) #print("X2=",str(X2)) X2 = np.array(X2).reshape([len(new_data), len(X2_keys)]) #print "X2=",X2.shape #print "X2=",X2 X1_keys = X_keys.difference(X2_keys.union(set(["two_year_recid_1"]))) if smlfeatures: X1_keys = X1_keys.difference( set([ "out_custody", "decile_score", "in_custody", "c_jail_out", "c_jail_in", "screening_date", "v_decile_score" ])) X1 = np.transpose([list(new_data[i]) for i in X1_keys]) print("X1 keys=", X1_keys) #sys.exit() #print "S=",S[:10] return np.array(S), np.array(X1), np.array(X2), np.array(y)
print("Sequential modeli çıkart...") # The maximum number of words to be used. (most frequent) MAX_NB_WORDS = 50000 # Max number of words in each complaint. MAX_SEQUENCE_LENGTH = 250 # This is fixed. EMBEDDING_DIM = 100 tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True) tokenizer.fit_on_texts(data['Icerik'].values) X = tokenizer.texts_to_sequences(data['Icerik'].values) X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH) Y = pd.get_dummies(data['Kategori']).values xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.05, random_state=42) model = Sequential() model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1])) model.add(SpatialDropout1D(0.2)) model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2)) model.add(Dropout(0.25)) model.add(Dense(2, activation='softmax')) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) epochs = 5 batch_size = 128 history = model.fit(xTrain, yTrain, epochs=epochs, batch_size=batch_size, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]) acc = model.evaluate(xTest, yTest) print(acc[1])
df = pd.read_csv('data/titanic.csv')

# Describing the dataset
df.info()

# Viewing the dataset
df.head(5)

# Dropping the features that have no importance for the model: Name, Ticket, Cabin and PassengerId:
df = df.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

# Filling the missing numeric values (NA) with the median.
df = df.fillna(df.median())

# Creating dummy variables for the categorical variables
df = pd.get_dummies(df, prefix=['Sex', 'Embarked'], drop_first=True)

# Viewing the cleaned dataset:
df.head(10)

# Defining the dependent/independent variables.
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

# Creating the training and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scaling the features
X_train = feature_scaling(X_train)
X_test = feature_scaling(X_test)
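# The feature_scaling helper used above is not defined in this snippet. A minimal sketch of
# what it might look like, assuming a simple wrapper around scikit-learn's StandardScaler:
from sklearn.preprocessing import StandardScaler

def feature_scaling(X):
    # Hypothetical helper: standardize each column to zero mean and unit variance.
    # In a stricter setup the scaler would be fit on the training set only and then
    # reused to transform the test set.
    return StandardScaler().fit_transform(X)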
# Get valid columns
filter_col = [
    col for col in df if col.startswith('Claim') or col.startswith('Past')
]
filter_col.remove('ClaimAmount')

X1 = df[[
    'MonthNumber', 'MonthYear', 'MinDate', 'TotalDays', 'DaysInPolicy',
    'Species', 'Breed', 'AgeAtEnroll', 'MinAgeInDays', 'MaxAgeInDays',
    'TotalDaysInPolicy', 'TotalMonthsInPolicy'
]]
X2 = df[filter_col]
XCombined = X1.join(X2)

# One-hot encode categorical variables
X = pd.get_dummies(XCombined, prefix_sep="_",
                   columns=['Breed', 'Species', 'AgeAtEnroll'])
y = df[['MinDate', 'ClaimAmount', 'PetId']]

# Create train/test splits
# Test on the previous year for accuracy
date = pd.Timestamp(2018, 7, 1)
X_train = X.loc[X['MinDate'] < date]
y_train = y.loc[y['MinDate'] < date]
X_test = X.loc[X['MinDate'] == date]
y_test = y.loc[y['MinDate'] == date]
X_train.drop('MinDate', axis='columns', inplace=True)
X_test.drop('MinDate', axis='columns', inplace=True)
f.write('NA,NA,140000\n')

# load the raw dataset from the created csv file
# if pandas is not installed, just uncomment the following line:
# !pip install pandas
import pandas as pd

data = pd.read_csv(data_file)
print(data)

############### 2.2.2. Handling Missing Data ###############
# NaN entries are missing values. To handle them, we can either fill them in or delete them.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
inputs = inputs.fillna(inputs.mean())
print(inputs)

# For the Alley column, get_dummies splits it into Alley_Pave and Alley_nan columns of 1s and 0s
inputs = pd.get_dummies(inputs, dummy_na=True)

############### 2.2.3. Conversion to the ndarray Format ###############
# Since inputs and outputs are numerical, they can be converted to the ndarray format
from mxnet import np

X, y = np.array(inputs.values), np.array(outputs.values)
X
y

############### 2.2.4. Summary ###############
# like many other extension packages in the vast ecosystem of Python, pandas can work together with ndarray
# imputation and deletion can be used to handle missing data
def data_preparation(self, df5):
    ## 5.1. Data normalization

    ## 5.2. Data rescaling
    # Before choosing which rescaling method to use, we must know which variables have outliers.
    #sns.boxplot(df5['competition_distance'])

    # competition distance
    df5['competition_distance'] = self.competition_distance_scaler.fit_transform(
        df5[['competition_distance']].values)

    # competition time month
    df5['competition_time_month'] = self.time_month_scaler.fit_transform(
        df5[['competition_time_month']].values)

    # year
    df5['year'] = self.year_scaler.fit_transform(df5[['year']].values)

    # promo time week
    df5['promo_time_week'] = self.promo_time_week_scaler.fit_transform(
        df5[['promo_time_week']].values)

    #sns.distplot(df5['competition_distance'])

    ## 5.3. Data transformation
    ### 5.3.1. Encoding
    #df5.select_dtypes('object')

    # state holiday - One Hot Encoding
    df5 = pd.get_dummies(df5, prefix=['state_holiday'], columns=['state_holiday'])

    # store type - Label Encoding
    df5['store_type'] = self.encoding_store_type.fit_transform(df5['store_type'])

    # assortment - Ordinal Encoding
    assortment_dict = {'basic': 1, 'extra': 2, 'extended': 3}
    df5['assortment'] = df5['assortment'].map(assortment_dict)

    ### 5.3.2. Nature Transformation
    # day of week
    df5['day_of_week_sin'] = df5['day_of_week'].apply(lambda x: np.sin(x * (2 * np.pi / 7)))
    df5['day_of_week_cos'] = df5['day_of_week'].apply(lambda x: np.cos(x * (2 * np.pi / 7)))

    # day
    df5['day_sin'] = df5['day'].apply(lambda x: np.sin(x * (2 * np.pi / 30)))
    df5['day_cos'] = df5['day'].apply(lambda x: np.cos(x * (2 * np.pi / 30)))

    # month
    df5['month_sin'] = df5['month'].apply(lambda x: np.sin(x * (2 * np.pi / 12)))
    df5['month_cos'] = df5['month'].apply(lambda x: np.cos(x * (2 * np.pi / 12)))

    # week of year
    df5['week_of_year_sin'] = df5['week_of_year'].apply(lambda x: np.sin(x * (2 * np.pi / 52)))
    df5['week_of_year_cos'] = df5['week_of_year'].apply(lambda x: np.cos(x * (2 * np.pi / 52)))

    cols_selected = [
        'store', 'promo', 'store_type', 'assortment', 'competition_distance',
        'competition_open_since_month', 'competition_open_since_year', 'promo2',
        'promo2_since_week', 'promo2_since_year', 'competition_time_month',
        'promo_time_week', 'day_of_week_sin', 'day_of_week_cos', 'day_sin',
        'day_cos', 'month_cos', 'month_sin', 'week_of_year_cos', 'week_of_year_sin'
    ]

    return df5[cols_selected]
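# The self.*_scaler and self.encoding_store_type attributes are assumed to be created elsewhere
# in the class; the snippet above only shows them being used. A minimal sketch of one plausible
# initialization, with RobustScaler for the outlier-heavy features and MinMaxScaler for the
# bounded ones (the class name and scaler choices are assumptions, not taken from the source):
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder

class SalesPipeline:
    def __init__(self):
        # Hypothetical setup of the attributes used by data_preparation()
        self.competition_distance_scaler = RobustScaler()  # robust to the outliers noted above
        self.time_month_scaler = RobustScaler()
        self.year_scaler = MinMaxScaler()
        self.promo_time_week_scaler = MinMaxScaler()
        self.encoding_store_type = LabelEncoder()

# Note that in a serving pipeline these scalers would normally be fitted once at training time
# and only transform (rather than fit_transform) would be called inside data_preparation.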
# AGM
#data = data.drop (columns=[
#    'flowStartMilliseconds',
#    'sourceIPAddress',
#    'mode(destinationIPAddress)',
#    'mode(_tcpFlags)',
#    'Label',
#    'Attack' ])
#nominalFeatures = ['mode(sourceTransportPort)', 'mode(destinationTransportPort)', 'mode(protocolIdentifier)']

for nominal in nominalFeatures:
    freqValues = list(data[nominal].value_counts().iloc[:10].keys())
    data.loc[~data[nominal].isin(freqValues), nominal] = np.nan
data = pd.get_dummies(data, columns=nominalFeatures, drop_first=True, dtype=np.float64)

columns = list(data.columns)
allNominal = []
nominalColumns = {}
for feat in nominalFeatures:
    nominalColumns[feat] = [
        i for i in range(len(columns)) if columns[i].startswith(feat + '_')
    ]
    allNominal += nominalColumns[feat]

vec = [False] * len(columns)
notAdded = [ind for ind in range(len(columns)) if ind not in allNominal]
notAddedNominal = nominalFeatures[:]

# downsample to 5%
_, data, _, labels = train_test_split(data, labels, test_size=0.05,
                                      random_state=1, stratify=attacks)
data = minmax_scale(data)
### Pre-processing

# Creation of weekend variable from dayofweek variable
print(df.groupby(['dayofweek'])['loan_status'].value_counts(normalize=True), "\n")
df['weekend'] = df['dayofweek'].apply(lambda x: 1 if (x > 0 and x < 4) else 0)

# Result
print(df.weekend.value_counts(), "\n")
print(df.groupby(['weekend'])['loan_status'].value_counts(normalize=True))

# Convert categorical features to numerical values
df['Gender'].replace(to_replace=['male', 'female'], value=[0, 1], inplace=True)

### Feature selection
final_df = df[['loan_status', 'Principal', 'terms', 'age', 'Gender', 'weekend']]
final_df = pd.concat([final_df, pd.get_dummies(df['education'])], axis=1)
final_df.drop(['Master or Above'], axis=1, inplace=True)

# Reduced dataset with only the most significant variables
#final_df = df[['loan_status','age','Gender','weekend']]

# Finalize dataset
X = final_df.drop(columns=['loan_status'])
X = preprocessing.StandardScaler().fit(X).transform(X)
y = final_df['loan_status'].values

# Split between train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
# Create a boxplot of life expectancy per region
df.boxplot('life', 'Region', rot=60)

# Show the plot
plt.show()

# Creating dummy variables
# As Andy discussed in the video, scikit-learn does not accept non-numerical features.
# You saw in the previous exercise that the 'Region' feature contains very useful information
# that can predict life expectancy. For example, Sub-Saharan Africa has a lower life expectancy
# compared to Europe and Central Asia. Therefore, if you are trying to predict life expectancy,
# it would be preferable to retain the 'Region' feature. To do this, you need to binarize it by
# creating dummy variables, which is what you will do in this exercise.

# Create dummy variables: df_region
df_region = pd.get_dummies(df)

# Print the columns of df_region
print(df_region.columns)

# Create dummy variables with drop_first=True: df_region
df_region = pd.get_dummies(df, drop_first=True)

# Print the new columns of df_region
print(df_region.columns)

# Regression with categorical features
# Having created the dummy variables from the 'Region' feature, you can build
# regression models as you did before. Here, you'll use ridge regression to perform
# 5-fold cross-validation. The feature array X and target variable array y have been pre-loaded.
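# The snippet stops before the ridge regression step it describes. A minimal sketch of that
# step, assuming X and y are already prepared as stated above (the alpha value is illustrative):
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Instantiate a ridge regressor
ridge = Ridge(alpha=0.5)

# Perform 5-fold cross-validation and report the scores
ridge_cv = cross_val_score(ridge, X, y, cv=5)
print(ridge_cv)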
def one_hot_encoder(df, nan_as_category=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
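# A quick, hypothetical usage example of one_hot_encoder; the DataFrame and column names
# below are made up for illustration:
import pandas as pd

raw = pd.DataFrame({'color': ['red', 'blue', None], 'price': [3, 5, 7]})
encoded, added_cols = one_hot_encoder(raw, nan_as_category=True)
print(added_cols)    # ['color_blue', 'color_red', 'color_nan']
print(encoded.head())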
intercolumnarDistance = dataset.iloc[:, 2].values
upperMargin = dataset.iloc[:, 3].values
lowerMargin = dataset.iloc[:, 4].values
exploitation = dataset.iloc[:, 5].values
rowNumber = dataset.iloc[:, 6].values
modularRatio = dataset.iloc[:, 7].values
interlinearSpacing = dataset.iloc[:, 8].values
weight = dataset.iloc[:, 9].values
peakNumber = dataset.iloc[:, 10].values
yvalue_class = dataset.iloc[:, 11].values

# Y value: vectorize the class labels into one-hot form
encoder = LabelEncoder()
y = encoder.fit_transform(yvalue_class)
Y = pd.get_dummies(y).values
Y = np.array(Y)

# Assemble the feature matrix from the nine extracted columns
X = []
for a, b, c, d, e, f, g, h, i in zip(intercolumnarDistance, upperMargin, lowerMargin,
                                     exploitation, rowNumber, modularRatio,
                                     interlinearSpacing, weight, peakNumber):
    X.append([a, b, c, d, e, f, g, h, i])
X = np.array(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y)

model = Sequential()
model.add(Dense(8, input_shape=(9,), activation='softmax'))
model.add(Dense(10, activation='tanh'))
model.add(Dense(12, activation='relu'))
from sklearn import model_selection
from sklearn.externals import joblib

# returns current working directory
os.getcwd()
# changes working directory
os.chdir("E:/")

titanic_train = pd.read_csv("train.csv")

# EDA
titanic_train.shape
titanic_train.info()

# data preparation
titanic_train1 = pd.get_dummies(titanic_train, columns=['Pclass', 'Sex', 'Embarked'])
titanic_train1.shape
titanic_train1.info()
titanic_train1.head(6)

# feature engineering
X_train = titanic_train1.drop(['PassengerId', 'Age', 'Cabin', 'Ticket', 'Name', 'Survived'], 1)
y_train = titanic_train['Survived']

# build the decision tree model
dt = tree.DecisionTreeClassifier()

# use cross validation to estimate performance of the model.
# No model built during cross validation is used as the final model.
cv_scores = model_selection.cross_val_score(dt, X_train, y_train, cv=10, verbose=1)
cv_scores.mean()
#alter['ALTBE'].fillna('0', inplace=True)
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTBE']])
#alter.loc[np.where(l == 'null')[0],'ALTBE'] = '0'
#alter['ALTBE'] = np.array(
#    [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTBE']])

#alter['ALTAF'].fillna('0', inplace=True)
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTAF']])
#alter.loc[np.where(l == 'null')[0],'ALTAF'] = '0'
#alter['ALTAF'] = np.array(
#    [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTAF']])

# ALTDATE
alter['alt_year'], alter['alt_month'] = splitDate(alter['ALTDATE'])
alter_year_dummies = pd.get_dummies(alter['alt_year'])
alter_month_dummies = pd.get_dummies(alter['alt_month'])
addColumnsPrefix(alter_year_dummies, 'alter_year')
addColumnsPrefix(alter_month_dummies, 'alter_month')
alter_date_dummies = alter_year_dummies.join(alter_month_dummies)
alter_date_dummies[['EID']] = alter[['EID']]
alter_date_dummies = alter_date_dummies.groupby('EID').sum()
alter_date_dummies.reset_index(inplace=True)

# ALTERNO
alterno_dummies = pd.get_dummies(X_alter['ALTERNO'])
alterno_dummies[['EID']] = X_alter[['EID']]
alterno_dummies = alterno_dummies.groupby('EID').sum()
addColumnsPrefix(alterno_dummies, 'alterno')
alterno_dummies.reset_index(inplace=True)
def transform(self, df):
    df = pd.get_dummies(df, columns=self.column_array)
    if self.column_array_drop_first != []:
        df = pd.get_dummies(df, columns=self.column_array_drop_first, drop_first=True)
    return df
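# Only the transform method is shown above. A minimal sketch of how such a dummifier class
# might be set up and used, assuming a plain __init__ that just stores the two column lists
# (the class name, __init__, and example data are assumptions, not taken from the source):
import pandas as pd

class Dummifier:
    def __init__(self, column_array, column_array_drop_first=[]):
        self.column_array = column_array
        self.column_array_drop_first = column_array_drop_first

    def transform(self, df):
        df = pd.get_dummies(df, columns=self.column_array)
        if self.column_array_drop_first != []:
            df = pd.get_dummies(df, columns=self.column_array_drop_first, drop_first=True)
        return df

# Example: keep all levels of 'city', drop the first level of 'gender'
frame = pd.DataFrame({'city': ['NY', 'LA', 'NY'], 'gender': ['m', 'f', 'f']})
print(Dummifier(['city'], ['gender']).transform(frame))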
    dep_air_train = (df_train_co[i, 0], df_train_co[i, 1])
    arr_air_train = (df_train_co[i, 2], df_train_co[i, 3])
    dist_train.append(vincenty(dep_air_train, arr_air_train).km)
df_dist_train = pd.DataFrame({'Distance': dist_train})

# find vincenty distance to create new feature distance on test
dist_test = []
for i in range(0, 2229):
    dep_air_test = (df_test_co[i, 0], df_test_co[i, 1])
    arr_air_test = (df_test_co[i, 2], df_test_co[i, 3])
    dist_test.append(vincenty(dep_air_test, arr_air_test).km)
df_dist_test = pd.DataFrame({'Distance': dist_test})

df_train.drop(df_train.columns[[0, 2, 3, 4, 6, 7, 8, 11]], axis=1, inplace=True)
df_train = pd.concat([df_train,
                      pd.get_dummies(df_train['Departure'], prefix='Departure'),
                      pd.get_dummies(df_train['Arrival'], prefix='Arrival')], axis=1)
df_train.drop(['Departure'], axis=1, inplace=True)
df_train.drop(['Arrival'], axis=1, inplace=True)
df_train_w_dist = pd.concat([df_train, df_dist_train], axis=1)

df_test.drop(df_test.columns[[0, 2, 3, 4, 6, 7, 8]], axis=1, inplace=True)
df_test = pd.concat([df_test,
                     pd.get_dummies(df_test['Departure'], prefix='Departure'),
                     pd.get_dummies(df_test['Arrival'], prefix='Arrival')], axis=1)
df_test.drop(['Departure'], axis=1, inplace=True)
df_test.drop(['Arrival'], axis=1, inplace=True)
df_test_w_dist = pd.concat([df_test, df_dist_test], axis=1)

# special days
tygiving = '10-25'  # thanksgiving
mday = '05-11'      # mother's day
iday = '07-04'      # independence day
os.mkdir(dirname)

# Create the plot output directories only if they do not already exist
if not os.path.exists('plots/HandwritingVerification'):
    dirname = 'HandwritingVerification'
    os.mkdir('plots/%s' % dirname)
if not os.path.exists('plots/HandwritingVerification/Boosting'):
    dirname = 'Boosting'
    os.mkdir('plots/HandwritingVerification/%s' % dirname)

df = pd.get_dummies(df)
# print(df.head(5))

Z = df.loc[:, df.columns != 'CLASS_DISTINCT']
X = Z.loc[:, Z.columns != 'CLASS_SAME']
y = df['CLASS_DISTINCT']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=30)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
def cleaning_data():
    # Importing the datasets
    portfolio = pd.read_json("portfolio.json", lines=True)
    profile = pd.read_json("profile.json", lines=True)
    transcript = pd.read_json("transcript.json", lines=True)

    # Data cleaning of the portfolio dataset
    ohe = {
        'email': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        'mobile': [1, 1, 1, 1, 0, 1, 1, 1, 1, 1],
        'social': [1, 1, 0, 0, 0, 1, 1, 1, 1, 0],
        'web': [0, 1, 1, 1, 1, 1, 1, 0, 1, 1]
    }
    ohx = pd.DataFrame(ohe, columns=['email', 'mobile', 'social', 'web'])

    cleaned_portfolio = portfolio
    cleaned_portfolio = pd.concat([portfolio, ohx], axis=1)
    cleaned_portfolio = cleaned_portfolio.drop(['channels'], axis=1)

    # converting duration from days to hours for better comparison
    cleaned_portfolio['duration'] = cleaned_portfolio['duration'] * 24

    # one hot encoding the offer_type column
    ohe = pd.get_dummies(cleaned_portfolio['offer_type'])
    cleaned_portfolio = pd.concat([cleaned_portfolio, ohe], axis=1)
    cleaned_portfolio = cleaned_portfolio.drop(['offer_type'], axis=1)

    # renaming the id column to offer_id
    cleaned_portfolio = cleaned_portfolio.rename(columns={'id': 'offer_id'})

    # Data cleaning of the profile dataset
    # To check the number of NULL values in each column
    # profile.isnull().sum()
    '''
    gender              2175
    age                    0
    id                     0
    became_member_on       0
    income              2175
    '''
    # Also, on checking the age column against all the points having gender and income
    # as NULL, we find that the corresponding age value is 118, which is quite unusual.
    # So in order to cleanse the data we drop all such points.

    # Dropping NULL values
    cleaned_profile = profile
    cleaned_profile = cleaned_profile.dropna()

    # Renaming the id column to person_id
    cleaned_profile = cleaned_profile.rename(columns={'id': 'person_id'})

    # OneHotEncoding the gender column
    ohe = pd.get_dummies(cleaned_profile['gender'])
    cleaned_profile = pd.concat([cleaned_profile, ohe], axis=1)

    # Converting became_member_on to a date-time stamp because the machine will not
    # understand date data in integer form.
    cleaned_profile['became_member_on'] = pd.to_datetime(
        cleaned_profile['became_member_on'], format='%Y%m%d').dt.date

    # We add a today's-date column to the dataframe as a reference for calculating
    # the number of days the customer has been a member of Starbucks
    cleaned_profile['today_date'] = pd.to_datetime('20200828', format='%Y%m%d')
    cleaned_profile['today_date'] = pd.to_datetime(
        cleaned_profile['today_date'], format='%Y%m%d').dt.date
    cleaned_profile['days_of_membership'] = cleaned_profile['today_date'].sub(
        cleaned_profile['became_member_on'], axis=0)

    # Dividing the date difference by one day to convert it into a number of days
    cleaned_profile['days_of_membership'] = cleaned_profile[
        'days_of_membership'] / np.timedelta64(1, 'D')
    cleaned_profile['became_member_on'] = pd.to_datetime(
        cleaned_profile['became_member_on'], format='%Y-%m-%d').dt.year

    # Then we drop the reference column because it is not useful for further analysis
    cleaned_profile = cleaned_profile.drop(['today_date'], axis=1)

    cleaned_profile['age_by_decade'] = pd.cut(cleaned_profile['age'],
                                              bins=range(10, 120, 10),
                                              right=False,
                                              labels=['10s', '20s', '30s', '40s', '50s',
                                                      '60s', '70s', '80s', '90s', '100s'])
    cleaned_profile['income_range'] = pd.cut(cleaned_profile['income'],
                                             bins=range(0, 120001, 10000),
                                             right=False,
                                             labels=['10k', '20k', '30k', '40k', '50k', '60k',
                                                     '70k', '80k', '90k', '100k', '110k', '120k'])

    # Data cleaning of transcript.json
    cleaned_transcript = transcript

    # OneHotEncoding the event column
    ohy = pd.get_dummies(cleaned_transcript['event'])
    cleaned_transcript = pd.concat([cleaned_transcript, ohy], axis=1)
    cleaned_transcript = cleaned_transcript.drop(['event'], axis=1)

    # Removing all the information of the people with NULL values which we previously dropped
    profile118 = profile[profile['age'] == 118]
    id118 = profile118['id']
    cleaned_transcript = cleaned_transcript[~cleaned_transcript['person'].isin(id118)]

    cleaned_transcript['record'] = cleaned_transcript.value.apply(lambda x: list(x.keys())[0])
    cleaned_transcript['record_value'] = cleaned_transcript.value.apply(lambda x: list(x.values())[0])
    cleaned_transcript.drop(['value'], axis=1, inplace=True)

    transactions = cleaned_transcript[cleaned_transcript.transaction == 1]
    offers = cleaned_transcript[cleaned_transcript.transaction != 1]

    # cleaning transactions
    transactions = transactions.drop(['offer completed', 'offer viewed', 'offer received'], axis=1)
    transactions = transactions.drop(['transaction', 'record'], axis=1)
    transactions = transactions.rename(columns={'record_value': 'amount'})
    transactions['amount_range'] = pd.cut(transactions['amount'],
                                          bins=range(0, 1150, 50),
                                          right=False,
                                          labels=['50', '100', '150', '200', '250', '300', '350',
                                                  '400', '450', '500', '550', '600', '650', '700',
                                                  '750', '800', '850', '900', '950', '1000',
                                                  '1050', '1100'])

    # cleaning offers
    offers = offers.drop(['transaction', 'record'], axis=1)
    offers = offers.rename(columns={'record_value': 'offer_id'})

    return cleaned_portfolio, cleaned_profile, offers, transactions
def loaddataset(self, path, module):
    df = pd.read_csv(path)
    subdf = df[['PassengerId', 'Pclass', 'Sex', 'Age', 'Embarked', 'Fare', 'SibSp', 'Parch']]
    SibSp = subdf['SibSp']
    Parch = subdf['Parch']

    # fill missing Age and Fare values with the column means
    Age = subdf['Age'].fillna(value=subdf.Age.mean())
    Fare = subdf['Fare'].fillna(value=subdf.Fare.mean())

    dummies_Sex = pd.get_dummies(subdf['Sex'], prefix='Sex')
    dummies_Embarked = pd.get_dummies(subdf['Embarked'], prefix='Embarked')
    dummies_Pclass = pd.get_dummies(subdf['Pclass'], prefix='Pclass')
    PassengerId = subdf['PassengerId']

    # scale Age & Fare into [0, 1]; MinMaxScaler expects 2-D input, hence the reshape
    scaler = MinMaxScaler()
    age_scaled = scaler.fit_transform(Age.values.reshape(-1, 1))
    fare_scaled = scaler.fit_transform(Fare.values.reshape(-1, 1))
    Age_Scaled = pd.DataFrame(age_scaled, columns=['Age_Scaled'])
    Fare_Scaled = pd.DataFrame(fare_scaled, columns=['Fare_Scaled'])

    if module == 'train':
        self.trainlabel = df.Survived
        self.trainset = pd.concat([dummies_Pclass, dummies_Sex, dummies_Embarked,
                                   Age_Scaled, Fare_Scaled, SibSp, Parch], axis=1)
    elif module == 'test':
        self.testset = pd.concat([PassengerId, dummies_Pclass, dummies_Sex, dummies_Embarked,
                                  Age_Scaled, Fare_Scaled, SibSp, Parch], axis=1)