from pandas import read_csv


def output_predictions(path, start, end, values):
    # Build a dict mapping (home, visitor, season) keys to the supplied
    # prediction values for every game whose Season falls in [start, end].
    keys = []
    df = read_csv(path)
    # start, end = test[0], test[1]
    criterion = df['Season'].map(lambda x: x in range(start, end + 1))
    df2 = df[criterion]
    df2 = df2.reset_index(drop=True)
    for index, row in df2.iterrows():
        keys.append((row['Home/Neutral'], row['Visitor/Neutral'], row['Season']))
    output = dict(zip(keys, values))
    return output
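# A minimal usage sketch for output_predictions (not from the original repo):
# the CSV path, season range and prediction values below are hypothetical, and
# the file is assumed to contain the 'Season', 'Home/Neutral' and
# 'Visitor/Neutral' columns used above.
if __name__ == "__main__":
    example_values = [0.62, 0.48, 0.55]                 # hypothetical predictions
    preds = output_predictions("games.csv",              # hypothetical path
                               start=2015, end=2016,
                               values=example_values)
    for (home, visitor, season), p in preds.items():
        print(season, home, "vs", visitor, "->", p)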
# print "standard deviations of R values in the heart rate: ", standard_deviation #Calculating the square root of the mean of list of squared differences heart_measure["rmssd"] = np.sqrt(np.mean(heart_measure["RR_sqdiff"])) #Create a list of all values with RR_diff over 20 and 50 nn20 = [x for x in heart_measure["RR_diff"] if (heart_measure["RR_diff"]>20)] nn50 = [x for x in heart_measure["RR_diff"] if (heart_measure["RR_diff"]>50)] heart_measure["pnn20"] = float(len(nn20)) / float(len(heart_measure["RR_diff"])) # Calculate the proportion of NN20, NN50 intervals to all intervals heart_measure["pnn50"] = float(len(nn50)) / float(len(heart_measure["RR_diff"])) # Note the use of float(), because we don't want Python to think we want an int() and round the proportion to 0 or 1 # print "pNN20, pNN50:", pnn20, pnn50 return heart_measure if __name__ == "__main__": data = read_csv("data.csv") # plot_data(data, "Heart Rate Signal") frequency = 100 # This dataset has a given frequency of 100Hz window_size = 0.75 # one sided window size as a proportion of the sampling frequency data_new, heart_measures = detect_peak(data, frequency, window_size) heart_measures = calc_heart_rate(heart_measures, frequency) # print "bpm is: %0.01f" % bpm plt.title("Detected Peaks in Heart Rate Signal") plt.xlim(0, 2500) plt.plot(data.hart, alpha=0.5, color="blue", label="raw signal") # aplha sets the transparency level plt.plot(heart_measures["moving_average"], color="black", ls="-.",
def main(*args):
    import pandas as pd
    import logging
    import numpy as np

    ##################################################################
    # Preparations: define argument variables and configure logging
    # logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL
    data_address = args[0]
    log_address = args[1]
    model_output_address = args[2]

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("log")
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(log_address + "/log.log")
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logging.basicConfig()
    logger.info("Program Start..")

    ##################################################################
    # First: read data from the directory -> rawData
    import read_data
    logger.info("read data start..")
    rawData = read_data.read_csv(data_address)
    # Show all columns in the console output
    pd.set_option('display.max_columns', None)
    # pd.set_option('display.max_rows', None)
    # print(rawData_test)
    logger.info("read data end..")

    ##################################################################
    # Second: preprocess rawData into train & test sets
    # Steps: 1. remove outliers  2. feature scaling  3. test/train split & shuffle
    import preprocessing
    logger.info("preprocessing start..")
    # Preprocess rawData according to data characteristics
    # step 1: remove outliers & feature scaling (from EDA or domain knowledge)
    # X (independent variables) & Y (dependent variable) split
    preprocessing_Data_X, preprocessing_Data_Y, x_test = preprocessing.preprocessing(
        rawData)
    x_train = preprocessing_Data_X
    y_train = preprocessing_Data_Y
    # print(x_train.shape, y_train.shape)
    logger.info("preprocessing end..")

    ##################################################################
    # Third: build models
    logger.info("build Model start..")
    import knn
    import logistic_regression
    import randomforest
    import gradientboosting
    from sklearn.model_selection import KFold

    num_folds = 5
    num_instance = len(y_train)
    k_fold = KFold(n_splits=num_folds, shuffle=True)

    x_train = x_train.values
    y_train = y_train.values.ravel()

    knn_model = knn.KNN_clf(10, x_train, y_train)
    logistic_model = logistic_regression.regression(x_train, y_train)
    randomforest_model = randomforest.randomforest(x_train, y_train, 20, 0)
    xgboost_model = gradientboosting.xgb(x_train, y_train, 0.01, 100)
    logger.info("build Model end..")

    ##################################################################
    # Fourth: test & tune models
    logger.info("test start..")
    from sklearn import metrics
    from sklearn.model_selection import cross_val_score

    # In-sample accuracy and K-fold cross-validated accuracy against the true labels
    y_pred_knn_model = knn_model.predict(x_train)
    print('knn_model accuracy :',
          metrics.accuracy_score(y_train, y_pred_knn_model))
    print('knn_model accuracy (K-Fold) :',
          np.mean(cross_val_score(knn_model, x_train, y_train,
                                  cv=k_fold, scoring='accuracy')))

    y_pred_logistic_model = logistic_model.predict(x_train)
    print('logistic_model accuracy :',
          metrics.accuracy_score(y_train, y_pred_logistic_model))
    print('logistic_model accuracy (K-Fold) :',
          np.mean(cross_val_score(logistic_model, x_train, y_train,
                                  cv=k_fold, scoring='accuracy')))

    y_pred_randomforest_model = randomforest_model.predict(x_train)
    print('randomforest_model accuracy :',
          metrics.accuracy_score(y_train, y_pred_randomforest_model))
    print('randomforest_model accuracy (K-Fold) :',
          np.mean(cross_val_score(randomforest_model, x_train, y_train,
                                  cv=k_fold, scoring='accuracy')))

    y_pred_xgboost_model = xgboost_model.predict(x_train)
    print('xgboost_model accuracy :',
          metrics.accuracy_score(y_train, y_pred_xgboost_model))
    print('xgboost_model accuracy (K-Fold) :',
          np.mean(cross_val_score(xgboost_model, x_train, y_train,
                                  cv=k_fold, scoring='accuracy')))

    y_pred_xgboost_model_final = xgboost_model.predict(x_test)
    logger.info("test end..")

    ##################################################################
    # Fifth: clear memory & save output
    logger.info("save start..")
    import joblib
    joblib.dump(xgboost_model, model_output_address + "/xgboost_model.pkl")
    logger.info("save end..")
    logger.info("Program End..")
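# A minimal sketch of reloading the persisted model. load_and_predict is a
# hypothetical helper (not part of the original script); it assumes the same
# model_output_address directory used in main() and an x_test matrix produced by
# the preprocessing step. joblib.load is the counterpart of joblib.dump above.
import joblib


def load_and_predict(model_output_address, x_test):
    model = joblib.load(model_output_address + "/xgboost_model.pkl")
    return model.predict(x_test)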
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from matplotlib import rc
import missingno as mano

plt.style.use("ggplot")
mpl.rcParams["axes.unicode_minus"] = False

# data load
import read_data

# Show all columns in the console output
pd.set_option('display.max_columns', None)
train = read_data.read_csv(
    "C:/Users/whcl3/PycharmProjects/DataScience/Kaggle/titanic/train.csv")
test = read_data.read_csv(
    "C:/Users/whcl3/PycharmProjects/DataScience/Kaggle/titanic/test.csv")

# Second column: extract a Title feature from Name
train_test_data = [train, test]
for dataset in train_test_data:
    dataset["Title"] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Remove unnecessary columns
# axis=1 drops a column, axis=0 drops a row
test.drop('Name', axis=1, inplace=True)
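# A small self-contained check (not part of the original script) showing what the
# Title extraction above produces on a few sample Name values.
sample = pd.DataFrame({"Name": ["Braund, Mr. Owen Harris",
                                "Cumings, Mrs. John Bradley",
                                "Heikkinen, Miss. Laina"]})
sample["Title"] = sample["Name"].str.extract(r'([A-Za-z]+)\.', expand=False)
print(sample["Title"].tolist())  # ['Mr', 'Mrs', 'Miss']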
nn20 = [
    x for x in heart_measure["RR_diff"] if x > 20
]
nn50 = [
    x for x in heart_measure["RR_diff"] if x > 50
]
# Calculate the proportion of NN20, NN50 intervals to all intervals.
# Note the use of float(): we don't want Python to do integer division and round the proportion to 0 or 1.
heart_measure["pnn20"] = float(len(nn20)) / float(
    len(heart_measure["RR_diff"])
)
heart_measure["pnn50"] = float(len(nn50)) / float(
    len(heart_measure["RR_diff"])
)
# print "pNN20, pNN50:", pnn20, pnn50

return heart_measure


if __name__ == "__main__":
    data = read_csv("data.csv")
    # plot_data(data, "Heart Rate Signal")
    frequency = 100  # this dataset has a given sampling frequency of 100 Hz
    window_size = 0.75  # one-sided window size as a proportion of the sampling frequency
    data_new, heart_measures = detect_peak(data, frequency, window_size)
    heart_measures = calc_heart_rate(heart_measures, frequency)
    # print "bpm is: %0.01f" % bpm
    plt.title("Detected Peaks in Heart Rate Signal")
    plt.xlim(0, 2500)
    plt.plot(data.hart, alpha=0.5, color="blue", label="raw signal")  # alpha sets the transparency level
    plt.plot(heart_measures["moving_average"], color="black", ls="-.",
             label="moving average")
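# A standalone numeric sketch (not from the original script) of the pNN20/pNN50
# calculation above, using a hypothetical array of successive RR differences in ms.
import numpy as np

rr_diff = np.array([12.0, 25.0, 8.0, 60.0, 31.0])           # hypothetical RR differences
pnn20 = float(np.sum(rr_diff > 20)) / float(len(rr_diff))   # 3/5 = 0.6
pnn50 = float(np.sum(rr_diff > 50)) / float(len(rr_diff))   # 1/5 = 0.2
print(pnn20, pnn50)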
train_df, test_df, numerical_patterns, cat_patterns = read_data_ph1()
predictors = numerical_patterns + cat_patterns
categorical = cat_patterns

# Hold out day 9 at hours 13, 17 and 21 as the validation set
is_val = (train_df['day'] == 9) & ((train_df['hour'] == 13) |
                                   (train_df['hour'] == 17) |
                                   (train_df['hour'] == 21))
val_df = train_df[is_val]
train_df = train_df[~is_val]

auc = model_lib.Predict(train_df, val_df, test_df, predictors, categorical,
                        seed=get_opt('seed', 2018))
print('validation auc:', auc)

# Map the test predictions back to the submission's click_id ordering
test_df = test_df[['pred']].rename(columns={'pred': 'is_attributed'})
mapping = read_csv('../input/mapping.csv')
click_id = read_csv('../input/sample_submission.csv', usecols=['click_id'])
test_df = test_df.reset_index().merge(mapping, left_on='index',
                                      right_on='old_click_id', how='left')
test_df = click_id.merge(test_df, on='click_id', how='left')

outfile = '../csv/pred_test_' + target + '.csv'
print('writing to', outfile)
test_df[['click_id', 'is_attributed']].to_csv(outfile, index=False)
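# A toy illustration (hypothetical data, not from the original pipeline) of the
# reindexing above: predictions are joined to the submission's click_id via the
# old_click_id column of mapping.csv.
import pandas as pd

toy_pred = pd.DataFrame({'is_attributed': [0.9, 0.1]})
toy_map = pd.DataFrame({'old_click_id': [0, 1], 'click_id': [101, 100]})
toy_sub = pd.DataFrame({'click_id': [100, 101]})
merged = toy_pred.reset_index().merge(toy_map, left_on='index',
                                      right_on='old_click_id', how='left')
print(toy_sub.merge(merged, on='click_id', how='left')[['click_id', 'is_attributed']])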
def main(*args):
    import pandas as pd
    import logging

    ##################################################################
    # Preparations: define argument variables and configure logging
    # logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL
    data_address = args[0]
    log_address = args[1]
    model_output_address = args[2]

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("log")
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(log_address + "/log.log")
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logging.basicConfig()
    logger.info("Program Start..")

    ##################################################################
    # First: read data from the directory -> rawData
    import read_data
    logger.info("read data start..")
    rawData = read_data.read_csv(data_address)
    # Show all columns in the console output
    pd.set_option('display.max_columns', None)
    # pd.set_option('display.max_rows', None)
    # print(rawData)
    logger.info("read data end..")

    ##################################################################
    # Second: preprocess rawData into train & test sets
    # Steps: 1. remove outliers  2. feature scaling  3. test/train split & shuffle
    import preprocessing
    logger.info("preprocessing start..")
    # Preprocess rawData according to data characteristics
    # For regression (e.g. demand/time forecasting) --> preprocessing_Data
    # step 1: remove outliers (from EDA or domain knowledge)
    preprocessing_Data = rawData
    # step 2: feature scaling
    # For classification (e.g. images, natural language) --> preprocessing_data
    preprocessing_data = rawData

    # X (independent variables) & Y (dependent variable) split
    independent_var = preprocessing_data.loc[:, [
        "Temperature", "Humidity", "Light", "CO2", "HumidityRatio"
    ]]
    dependent_var = preprocessing_data.loc[:, ["Occupancy"]]

    # Train & test split (independent vars, dependent vars, shuffle flag, test set size)
    x_train, x_test, y_train, y_test = preprocessing.train_test_split(
        independent_var, dependent_var, True, 0.2)
    logger.info("preprocessing end..")

    ##################################################################
    # Third: build models
    logger.info("build Model start..")
    import logistic_regression
    import randomforest
    import gradientboosting

    x_train = x_train.values
    y_train = y_train.values.ravel()

    logistic_model = logistic_regression.regression(x_train, y_train)
    randomforest_model = randomforest.randomforest(x_train, y_train, 20, 0)
    xgboost_model = gradientboosting.xgb(x_train, y_train, 0.02, 20)
    logger.info("build Model end..")

    ##################################################################
    # Fourth: test & tune models
    logger.info("test start..")
    from sklearn import metrics

    y_pred_logistic_model = logistic_model.predict(x_test)
    print('logistic_model accuracy :',
          metrics.accuracy_score(y_test, y_pred_logistic_model))
    y_pred_randomforest_model = randomforest_model.predict(x_test)
    print('randomforest_model accuracy :',
          metrics.accuracy_score(y_test, y_pred_randomforest_model))
    y_pred_xgboost_model = xgboost_model.predict(x_test)
    print('xgboost_model accuracy :',
          metrics.accuracy_score(y_test.values, y_pred_xgboost_model))
    confusion = metrics.confusion_matrix(y_test.values, y_pred_xgboost_model)
    print(confusion)
    logger.info("test end..")

    ##################################################################
    # Fifth: clear memory & save output
    logger.info("save start..")
    import joblib
    joblib.dump(logistic_model, model_output_address + "/logistic_model.pkl")
    logger.info("save end..")
    logger.info("Program End..")
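# The preprocessing.train_test_split helper used above is project-specific. A
# plausible minimal implementation (an assumption, not the original module) would
# simply wrap sklearn's train_test_split with a shuffle flag and a test-set size:
from sklearn.model_selection import train_test_split as sk_train_test_split


def train_test_split(independent_var, dependent_var, shuffle, test_size):
    # Returns x_train, x_test, y_train, y_test, matching the call order in main()
    return sk_train_test_split(independent_var, dependent_var,
                               shuffle=shuffle, test_size=test_size)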