Code example #1
from pandas import read_csv  # assumed: the snippet calls read_csv() directly


def output_predictions(path, start, end, values):
    # Build a dict mapping (home, visitor, season) keys to the given values
    # for every game whose Season falls within [start, end].
    keys = []
    df = read_csv(path)
    # start, end = test[0], test[1]
    criterion = df['Season'].map(lambda x: x in range(start, end + 1))
    df2 = df[criterion]
    df2 = df2.reset_index(drop=True)
    for index, row in df2.iterrows():
        keys.append((row['Home/Neutral'], row['Visitor/Neutral'], row['Season']))
    output = dict(zip(keys, values))
    return output
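A minimal usage sketch; the file name, column contents, and predicted values below are hypothetical, assuming a CSV with the 'Season', 'Home/Neutral', and 'Visitor/Neutral' columns the function expects and one value per filtered row:

predicted_values = [101.5, 98.2, 110.0]  # hypothetical model outputs, one per matching game
predictions = output_predictions("games.csv", 2015, 2016, predicted_values)
for (home, visitor, season), value in predictions.items():
    print(season, home, "vs", visitor, "->", value)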
Code example #2
    # print "standard deviations of R values in the heart rate: ", standard_deviation

    #Calculating the square root of the mean of list of squared differences
    heart_measure["rmssd"] = np.sqrt(np.mean(heart_measure["RR_sqdiff"]))

    #Create a list of all values with RR_diff over 20 and 50
    nn20 = [x for x in heart_measure["RR_diff"] if x > 20]
    nn50 = [x for x in heart_measure["RR_diff"] if x > 50]
    heart_measure["pnn20"] = float(len(nn20)) / float(len(heart_measure["RR_diff"]))  # Calculate the proportion of NN20, NN50 intervals to all intervals
    heart_measure["pnn50"] = float(len(nn50)) / float(len(heart_measure["RR_diff"]))  # Note the use of float(), because we don't want Python to think we want an int() and round the proportion to 0 or 1
    # print "pNN20, pNN50:", pnn20, pnn50
    return heart_measure

if __name__ == "__main__":
    data = read_csv("data.csv")
    # plot_data(data, "Heart Rate Signal")
    frequency = 100  # This dataset has a given frequency of 100Hz
    window_size = 0.75  # one sided window size as a proportion of the sampling frequency
    data_new, heart_measures = detect_peak(data, frequency, window_size)
    heart_measures = calc_heart_rate(heart_measures, frequency)
    # print "bpm is: %0.01f" % bpm

    plt.title("Detected Peaks in Heart Rate Signal")
    plt.xlim(0, 2500)
    plt.plot(data.hart, alpha=0.5,
             color="blue",
             label="raw signal")  # aplha sets the transparency level
    plt.plot(heart_measures["moving_average"],
             color="black",
             ls="-.",
Code example #3
def main(*args):

    import pandas as pd
    import logging
    import numpy as np

    ##################################################################

    # Preparations: define argument variables and configure logging
    # logging level : DEBUG, INFO, WARNING, ERROR, CRITICAL

    data_adrress = args[0]
    log_adrress = args[1]
    model_output_adrress = args[2]

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("log")
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(log_adrress + "/log.log")
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logging.basicConfig()

    logger.info("Program Start..")

    ##################################################################

    # First : Read data from the directory -> rawData

    import read_data

    logger.info("read data start..")

    rawData = read_data.read_csv(data_adrress)

    # Show all columns in console output
    pd.set_option('display.max_columns', None)
    # pd.set_option('display.max_rows', None)

    # print(rawData)

    logger.info("read data end..")

    ##################################################################

    # Second : Preprocess data from rawData to Train & Test set
    # Step : 1.remove Outlier 2.Feature Scaling 3.Test/Train Split & Shuffle

    import preprocessing

    logger.info("preprocessing start..")

    # Preprocess rawData according to data characteristics

    # step 1 : remove outlier & feature scaling (from EDA or domain Knowledge)

    # X (Independent variable) & Y (Dependent variable) Split

    preprocessing_Data_X, preprocessing_Data_Y, x_test = preprocessing.preprocessing(
        rawData)

    x_train = preprocessing_Data_X
    y_train = preprocessing_Data_Y

    # print(x_train.shape, y_train.shape)

    logger.info("preprocessing end..")

    ##################################################################

    # Third : Build Model

    logger.info("build Model start..")

    import knn
    import logistic_regression
    import randomforest
    import gradientboosting
    from sklearn.model_selection import KFold

    num_folds = 5
    num_instance = len(y_train)
    k_fold = KFold(n_splits=num_folds, shuffle=True)

    x_train = x_train.values
    y_train = y_train.values.ravel()

    knn_model = knn.KNN_clf(10, x_train, y_train)
    logistic_model = logistic_regression.regression(x_train, y_train)
    randomforest_model = randomforest.randomforest(x_train, y_train, 20, 0)
    xgboost_model = gradientboosting.xgb(x_train, y_train, 0.01, 100)

    logger.info("build Model end..")

    ##################################################################

    # Fourth : Test & Tuning Model

    logger.info("test start..")

    from sklearn import metrics
    from sklearn.model_selection import cross_val_score

    # Evaluate each model on the training data, and cross-validate against the
    # true labels so the reported number is a K-Fold accuracy.
    y_pred_knn_model = knn_model.predict(x_train)
    print('knn_model accuracy:', metrics.accuracy_score(y_train, y_pred_knn_model))
    print(
        'knn_model accuracy (K-Fold):',
        np.mean(
            cross_val_score(knn_model,
                            x_train,
                            y_train,
                            cv=k_fold,
                            scoring='accuracy')))

    y_pred_logistic_model = logistic_model.predict(x_train)
    print('logistic_model accuracy:',
          metrics.accuracy_score(y_train, y_pred_logistic_model))
    print(
        'logistic_model accuracy (K-Fold):',
        np.mean(
            cross_val_score(logistic_model,
                            x_train,
                            y_train,
                            cv=k_fold,
                            scoring='accuracy')))

    y_pred_randomforest_model = randomforest_model.predict(x_train)
    print('randomforest_model accuracy:',
          metrics.accuracy_score(y_train, y_pred_randomforest_model))
    print(
        'randomforest_model accuracy (K-Fold):',
        np.mean(
            cross_val_score(randomforest_model,
                            x_train,
                            y_train,
                            cv=k_fold,
                            scoring='accuracy')))

    y_pred_xgboost_model = xgboost_model.predict(x_train)
    print('xgboost_model accuracy:',
          metrics.accuracy_score(y_train, y_pred_xgboost_model))
    print(
        'xgboost_model accuracy (K-Fold):',
        np.mean(
            cross_val_score(xgboost_model,
                            x_train,
                            y_train,
                            cv=k_fold,
                            scoring='accuracy')))

    y_pred_xgboost_model_final = xgboost_model.predict(x_test)

    logger.info("test end..")

    ##################################################################

    # Fifth : clear memory & Save Output

    logger.info("save start..")

    import joblib
    joblib.dump(xgboost_model, model_output_adrress + "./xgboost_model.pkl")

    logger.info("save end..")
    logger.info("Program End..")
Code example #4
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from matplotlib import rc
import missingno as mano

plt.style.use("ggplot")
mpl.rcParams["axes.unicode_minus"] = False

# data load

import read_data

# Show all columns in console output
pd.set_option('display.max_columns', None)

train = read_data.read_csv(
    "C:/Users/whcl3/PycharmProjects/DataScience/Kaggle/titanic/train.csv")
test = read_data.read_csv(
    "C:/Users/whcl3/PycharmProjects/DataScience/Kaggle/titanic/test.csv")

# Second: derive a Title column from the Name column

train_test_data = [train, test]

for dataset in train_test_data:
    dataset["Title"] = dataset['Name'].str.extract('([A-Za-z]+)\.',
                                                   expand=False)

# Drop unnecessary columns

# axis=1 drops a column, axis=0 drops a row
test.drop('Name', axis=1, inplace=True)
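The Title feature relies on pandas' Series.str.extract with a capture group ending in a literal dot. A minimal, self-contained sketch on a hand-made DataFrame (example names are illustrative):

import pandas as pd

df = pd.DataFrame({"Name": ["Braund, Mr. Owen Harris",
                            "Heikkinen, Miss. Laina"]})
# Capture the word immediately followed by '.' (e.g. 'Mr', 'Miss').
df["Title"] = df["Name"].str.extract(r'([A-Za-z]+)\.', expand=False)
print(df[["Name", "Title"]])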
Code example #5
    ]
    nn50 = [
        x for x in heart_measure["RR_diff"] if x > 50
    ]
    heart_measure["pnn20"] = float(len(nn20)) / float(
        len(heart_measure["RR_diff"])
    )  # Calculate the proportion of NN20, NN50 intervals to all intervals
    heart_measure["pnn50"] = float(len(nn50)) / float(
        len(heart_measure["RR_diff"])
    )  # Note the use of float(), because we don't want Python to think we want an int() and round the proportion to 0 or 1
    # print "pNN20, pNN50:", pnn20, pnn50
    return heart_measure


if __name__ == "__main__":
    data = read_csv("data.csv")
    # plot_data(data, "Heart Rate Signal")
    frequency = 100  # This dataset has a given frequency of 100Hz
    window_size = 0.75  # one sided window size as a proportion of the sampling frequency
    data_new, heart_measures = detect_peak(data, frequency, window_size)
    heart_measures = calc_heart_rate(heart_measures, frequency)
    # print "bpm is: %0.01f" % bpm

    plt.title("Detected Peaks in Heart Rate Signal")
    plt.xlim(0, 2500)
    plt.plot(data.hart, alpha=0.5, color="blue",
             label="raw signal")  # aplha sets the transparency level
    plt.plot(heart_measures["moving_average"],
             color="black",
             ls="-.",
             label="moving average")
Code example #6
File: main.py  Project: zyq11223/TalkingData
train_df, test_df, numerical_patterns, cat_patterns = read_data_ph1()
predictors = numerical_patterns + cat_patterns
categorical = cat_patterns

is_val = (train_df['day'] == 9) & ((train_df['hour'] == 13) |
                                   (train_df['hour'] == 17) |
                                   (train_df['hour'] == 21))
val_df = train_df[is_val]
train_df = train_df[~is_val]

auc = model_lib.Predict(train_df,
                        val_df,
                        test_df,
                        predictors,
                        categorical,
                        seed=get_opt('seed', 2018))
print('validation auc:', auc)

test_df = test_df[['pred']].rename(columns={'pred': 'is_attributed'})
mapping = read_csv('../input/mapping.csv')
click_id = read_csv('../input/sample_submission.csv', usecols=['click_id'])
test_df = test_df.reset_index().merge(mapping,
                                      left_on='index',
                                      right_on='old_click_id',
                                      how='left')
test_df = click_id.merge(test_df, on='click_id', how='left')
outfile = '../csv/pred_test_' + target + '.csv'
print('writing to', outfile)
test_df[['click_id', 'is_attributed']].to_csv(outfile, index=False)
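The submission step maps positional predictions back to the original click_id via reset_index plus a left merge. A small self-contained sketch of that pattern with made-up frames (column values are illustrative):

import pandas as pd

preds = pd.DataFrame({'is_attributed': [0.1, 0.9, 0.3]})      # predictions in row order
mapping = pd.DataFrame({'old_click_id': [0, 1, 2],
                        'click_id': [100, 101, 102]})         # position -> original id
out = preds.reset_index().merge(mapping, left_on='index',
                                right_on='old_click_id', how='left')
print(out[['click_id', 'is_attributed']])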
Code example #7
def main(*args):

    import pandas as pd
    import logging

    ##################################################################

    # Preparations: define argument variables and configure logging
    # logging level : DEBUG, INFO, WARNING, ERROR, CRITICAL

    data_adrress = args[0]
    log_adrress = args[1]
    model_output_adrress = args[2]

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("log")
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(log_adrress + "/log.log")
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logging.basicConfig()

    logger.info("Program Start..")

    ##################################################################

    # First : Read data from the directory -> rawData

    import read_data

    logger.info("read data start..")

    rawData = read_data.read_csv(data_adrress)

    # Show all columns in console output
    pd.set_option('display.max_columns', None)
    # pd.set_option('display.max_rows', None)

    #print(rawData)

    logger.info("read data end..")

    ##################################################################

    # Second : Preprocess data from rawData to Train & Test set
    # Step : 1.remove Outlier 2.Feature Scaling 3.Test/Train Split & Shuffle

    import preprocessing

    logger.info("preprocessing start..")

    # Preprocess rawData according to data characteristics

    # For regression (ex. Demand/Time forecasting) --> preprocessingData

    # step 1 : remove outlier (from EDA or domain Knowledge)
    preprocessing_Data = rawData

    # step 2 : Feature scaling

    # For classification (ex. Image, Natural language) --> preprocessingData

    preprocessing_data = rawData

    # X (Independent variable) & Y (Dependent variable) Split

    independent_var = preprocessing_data.loc[:, [
        "Temperature", "Humidity", "Light", "CO2", "HumidityRatio"
    ]]
    dependent_var = preprocessing_data.loc[:, ["Occupancy"]]

    # Train & Test Split (independent vars, dependent vars, shuffle flag, test set size)

    x_train, x_test, y_train, y_test = preprocessing.train_test_split(
        independent_var, dependent_var, True, 0.2)

    logger.info("preprocessing end..")

    ##################################################################

    # Third : Build Model

    logger.info("build Model start..")

    import logistic_regression
    import randomforest
    import gradientboosting

    x_train = x_train.values
    y_train = y_train.values.ravel()

    logistic_model = logistic_regression.regression(x_train, y_train)
    randomforest_model = randomforest.randomforest(x_train, y_train, 20, 0)
    xgboost_model = gradientboosting.xgb(x_train, y_train, 0.02, 20)

    logger.info("build Model end..")

    ##################################################################

    # Fourth : Test & Tuning Model

    logger.info("test start..")

    from sklearn import metrics

    y_pred_logistic_model = logistic_model.predict(x_test)
    print('logistic_model accuracy:',
          metrics.accuracy_score(y_test, y_pred_logistic_model))

    y_pred_randomforest_model = randomforest_model.predict(x_test)
    print('randomforest_model accuracy:',
          metrics.accuracy_score(y_test, y_pred_randomforest_model))

    y_pred_xgboost_model = xgboost_model.predict(x_test)
    print('xgboost_model accuracy:',
          metrics.accuracy_score(y_test.values, y_pred_xgboost_model))

    confusion = metrics.confusion_matrix(y_test.values, y_pred_xgboost_model)
    print(confusion)

    logger.info("test end..")

    ##################################################################

    # Fifth : clear memory & Save Output

    logger.info("save start..")

    import joblib
    joblib.dump(logistic_model, model_output_adrress + "./logistic_model.pkl")

    logger.info("save end..")
    logger.info("Program End..")