Ejemplo n.º 1
0
def grid_search_analysis(filename):
    """Read a grid-search result dump, print it sorted by score, and return it.

    Parameters
    ----------
    filename : str
        Name of a CSV inside ``dt.output_dir()`` whose rows hold
        (learning_rate, n_estimators, score) text cells plus one trailing
        junk column.

    Returns
    -------
    pandas.DataFrame
        Parsed table with numeric columns, sorted by 'score' ascending.
    """
    logger = logging.getLogger(__name__)

    ## Read the raw dump and drop the trailing junk column.
    logger.debug('Read and prepare cross validation score data')
    grid_search_data = pandas.read_csv(os.path.join(dt.output_dir(), filename), header=None, index_col=None)
    grid_search_data = grid_search_data.iloc[:, :-1]
    grid_search_data.columns = ['learning_rate', 'n_estimators', 'score']
    # Cells are text like "score=0.1234"; pull out the numeric fragment.
    grid_search_data.loc[:, 'score'] = grid_search_data.loc[:, 'score'].apply(lambda x: float(*re.findall(r'[0.]\d+', str(x))))
    grid_search_data.loc[:, 'learning_rate'] = grid_search_data.loc[:, 'learning_rate'].apply(lambda x: float(*re.findall(r'[0.]\d+', str(x))))
    grid_search_data.loc[:, 'n_estimators'] = grid_search_data.loc[:, 'n_estimators'].apply(lambda x: int(*re.findall(r'\d+', str(x))))

    ##
    logger.debug('Return table sorted by score')
    result = grid_search_data.sort_values(by='score')
    print(result)

    ##
    logger.debug('Analytics')
    # FIX: grouping by multiple columns requires a list; a tuple is treated
    # as a single (missing) key by modern pandas and raises a KeyError.
    print(grid_search_data.groupby(['n_estimators', 'learning_rate']).mean().sort_values('score'))

    return result
Ejemplo n.º 2
0
def train_ada_boost_classifier(x_train, y_train, x_test, y_test, max_depth, class_weight, 
                               n_estimators, learning_rate_lower, learning_rate_upper, 
                               learning_rate_num, criterion, machines, comment='AdaBoostClassifier'):
    """Grid-search an AdaBoost(DecisionTree) classifier and write predictions.

    Parameters
    ----------
    x_train, y_train : training features and labels (y as DataFrame/Series).
    x_test, y_test : validation features and labels.
    max_depth, criterion, class_weight : base DecisionTreeClassifier settings.
    n_estimators : iterable of estimator counts for the grid.
    learning_rate_lower, learning_rate_upper, learning_rate_num :
        logspace bounds and size for the learning-rate grid.
    machines : n_jobs passed to GridSearchCV.
    comment : suffix for the output CSV name.

    Returns
    -------
    (prediction, opt_model) : predicted labels and the fitted GridSearchCV.
    """
    logger = logging.getLogger(__name__)
    rs = numpy.random.RandomState(12357)

    ##
    logger.info('<--Spec model parameters-->')
    learning_rate = numpy.logspace(learning_rate_lower, learning_rate_upper, learning_rate_num)
    model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_depth, criterion=criterion, class_weight=class_weight, random_state=rs))
    param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
    # FIX: StratifiedKFold only honors random_state when shuffle=True and
    # raises a ValueError in scikit-learn >= 0.24 otherwise.
    kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=rs)
    score = make_scorer(f1_score, average='micro')
    
    ##
    logger.info('<--Start grid search over n_estimators and learning_rate-->')
    grid_search = GridSearchCV(model, param_grid, scoring=score, n_jobs=machines, cv=kfold, verbose=3)
    opt_model = grid_search.fit(x_train, y_train.values.flatten())
    logger.info("Best score: [{:f}] using [{}]".format(opt_model.best_score_, opt_model.best_params_))

    ##
    logger.info('<--Make prediction and write out-->')
    prediction = opt_model.predict(x_test)
    # FIX: f1_score expects (y_true, y_pred); the value is identical for
    # average='micro' (symmetric), but the order is now conventional.
    prediction_score = f1_score(y_test.values.flatten(), prediction, average='micro')
    logger.info('Check prediction score on validation set := [{:f}]'.format(prediction_score))

    output = pandas.Series(prediction, name='y')
    output.to_csv(os.path.join(dt.output_dir(), 'ABC_{:s}.csv'.format(comment)), index=True, header=['y'], index_label=['id'])

    ## Confusion matrix with rows = true labels, columns = predictions.
    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test.values.flatten(), prediction, labels=[0, 1, 2]))

    return prediction, opt_model
Ejemplo n.º 3
0
def train(eeg1, eeg2, emg, labels, validate_size, epochs, label, type):
    """Normalize the three signal channels, split off a validation tail,
    and fit a model via ``train_model``; returns the trained model."""
    # Row-normalize every channel after replacing NaNs with zero.
    channels = [df_row_norm(frame.fillna(0)).values
                for frame in (eeg1, eeg2, emg)]
    features = np.dstack(channels)
    targets = (labels - 1).values  # labels arrive 1-based; shift to 0-based

    # The last `validate_size` rows become the validation split.
    split = -validate_size
    x_validate = features[split:, :].copy()
    y_validate = targets[split:, :].copy()
    x_train = features[:split, :]
    y_train = targets[:split, :]

    model_path = os.path.join(dt.output_dir(), "%s.h5" % label)
    return train_model(x_train,
                       y_train,
                       x_validate,
                       y_validate,
                       epochs=epochs,
                       type=type,
                       model_path=model_path)
Ejemplo n.º 4
0
    def run(self):
        """Worker loop: pull (cexp, gexp) jobs, evaluate them, push results.

        A WorkerStopToken in the c slot stops this worker; the token is put
        back so sibling workers also stop. On failure the job is re-queued
        for another worker and this worker quits.
        """
        with open(os.path.join(dt.output_dir(), 'grid_diag'), 'a') as f:
            while True:
                (cexp, gexp) = self.job_queue.get()
                if cexp is WorkerStopToken:
                    # Re-queue the stop token for the remaining workers.
                    self.job_queue.put((cexp, gexp))
                    break
                try:
                    c, g = None, None
                    # An exponent of None means "do not grid over this one".
                    # FIX: identity comparison with None instead of `!=`.
                    if cexp is not None:
                        c = 2.0**cexp
                    if gexp is not None:
                        g = 2.0**gexp
                    rate = self.run_one(c, g, f)
                    if rate is None:
                        raise RuntimeError('get no rate')
                # FIX: narrow the bare `except:` so KeyboardInterrupt and
                # SystemExit are not swallowed.
                except Exception:
                    # We failed; let another worker retry this job, then quit.
                    traceback.print_exc()

                    self.job_queue.put((cexp, gexp))
                    sys.stderr.write('worker {0} quit.\n'.format(self.name))
                    break
                else:
                    self.result_queue.put((self.name, cexp, gexp, rate))
Ejemplo n.º 5
0
def predict(X_test_A, X_test_B, model, type, weights, label):
    """Score two test halves with `model`, write 1-based class predictions
    to `<label>.csv` in the output dir, and return the stacked score matrix.

    Parameters
    ----------
    X_test_A, X_test_B : arrays with the same number of samples, one half each.
    model : trained model exposing predict / input_shape / output_shape.
    type : ModelType; for CNN_LSTM the inputs are folded into time windows.
    weights : per-class multipliers applied to the raw scores.
    label : output file stem.
    """
    num_samples = X_test_A.shape[0]
    # FIX: the original compared X_test_A.shape[0] with itself (tautology);
    # both halves must contain the same number of samples.
    assert X_test_B.shape[0] == num_samples

    if type == ModelType.CNN_LSTM:
        assert len(model.input_shape) == 4
        time_steps = TIME_STEPS
        assert model.input_shape[1] == time_steps

        assert num_samples % time_steps == 0, "total number of samples must divide by number of time steps"

        # Fold consecutive samples into (batch, time_steps, ...) windows.
        X_test_A = X_test_A.reshape(
            (int(num_samples / time_steps), time_steps, *X_test_A.shape[1:]))
        X_test_B = X_test_B.reshape(
            (int(num_samples / time_steps), time_steps, *X_test_B.shape[1:]))

    # Flatten predictions back to one row per sample and re-weight classes.
    y_score_A = model.predict(X_test_A).reshape(
        num_samples, model.output_shape[-1]) * weights
    y_score_B = model.predict(X_test_B).reshape(
        num_samples, model.output_shape[-1]) * weights

    y_score = np.concatenate((y_score_A, y_score_B))
    y_test = np.argmax(y_score, axis=1)

    result = pd.Series(y_test)

    # Expected class shares (training distribution) vs. realized shares.
    expected = [0.526, 0.418, 0.0548]
    for i in range(3):
        print("class expected/realized class ratio [%s]: [%s/%s]" %
              (i, expected[i], sum(result == i) / len(result)))
    print("")

    result += 1  # classes are written out 1-based
    result.index.name = 'Id'
    result.name = 'y'
    pd.DataFrame(result).to_csv(os.path.join(dt.output_dir(),
                                             "%s.csv" % label))

    return y_score
Ejemplo n.º 6
0
def main():
    """Fit a ridge regression (lambda = 0) on the training data and write
    the test-set predictions to 'task0_solution.csv'."""
    test, train = read_data()

    ## The response column and every remaining column as predictors.
    yCols = ['y']
    xCols = list(set(train.columns).difference(yCols))

    assert set(yCols).intersection(xCols) == set(),\
     "there is a non-trivial intersection between yCols {} and xCols {}".format(" ".join(yCols), " ".join(xCols))

    ## Closed-form ridge fit; a Series keyed by predictor name.
    coefficients = dr.ridge_regression(X=train[xCols],
                                       y=train[yCols],
                                       lambdaParam=0)
    betas = pandas.Series(coefficients.flatten(), index=xCols)

    ## Predict on the test set and write with the expected 10000..11999 Ids.
    predictions = pandas.Series(test[xCols].dot(betas),
                                index=numpy.arange(10000, 12000))
    predictions.to_csv(os.path.join(dt.output_dir(), 'task0_solution.csv'),
                       index=True,
                       header=yCols,
                       index_label=['Id'])
Ejemplo n.º 7
0
-log2g {begin,end,step | "null"} : set the range of g (default 3,-15,-2)
    begin,end,step -- g_range = 2^{begin,...,begin+k*step,...,end}
    "null"         -- do not grid with g
-v n : n-fold cross validation (default 5)
-svmtrain pathname : set svm executable path and name
-gnuplot {pathname | "null"} :
    pathname -- set gnuplot executable path and name
    "null"   -- do not plot
-out {pathname | "null"} : (default dataset.out)
    pathname -- set output file path and name
    "null"   -- do not output file
-png pathname : set graphic output file path and name (default dataset.png)
-resume [pathname] : resume the grid task using an existing output file (default pathname is dataset.out)
    This is experimental. Try this option only if some parameters have been checked for the SAME data.

svm_options : additional options for svm-train""")
        sys.exit(1)

    # Require at least the dataset path argument.
    if len(sys.argv) < 2:
        exit_with_help()
    # Last CLI argument is the dataset; everything before it are options.
    dataset_pathname = sys.argv[-1]
    options = sys.argv[1:-1]
    try:
        # Mark the start of a grid run in the diagnostics file.
        with open(os.path.join(dt.output_dir(), 'grid_diag'), 'a') as f:
            f.write("START\n")
        find_parameters(dataset_pathname, options)
    except (IOError,ValueError) as e:
        # Bad path or malformed option: report and exit non-zero.
        sys.stderr.write(str(e) + '\n')
        sys.stderr.write('Try "grid.py" for more information.\n')
        sys.exit(1)
Ejemplo n.º 8
0
def train_svm_classifier(x_train,
                         y_train,
                         x_test,
                         y_test,
                         machines,
                         c_penalty_lower,
                         c_penalty_upper,
                         c_penalty_num,
                         g_lower,
                         g_upper,
                         g_num,
                         class_weight,
                         kernel,
                         comment='SMVC'):
    """Grid-search an SVC over C (and optionally gamma) and write predictions.

    If g_lower, g_upper and g_num are all None, gamma stays at the SVC
    default 'scale' and only C is searched; otherwise gamma is searched on
    a logspace grid as well.

    Returns
    -------
    (prediction, opt_model) : predicted labels and the fitted GridSearchCV.
    """
    ##
    logger = logging.getLogger(__name__)
    rs = numpy.random.RandomState(12357)

    ##
    support_vector_machine_classifier = SVC(gamma='scale',
                                            kernel=kernel,
                                            class_weight=class_weight)

    ##
    logger.info('<--Spec model parameters-->')
    c_penalty = numpy.logspace(c_penalty_lower, c_penalty_upper,
                               c_penalty_num)
    # FIX: use `is None` with short-circuit `and` instead of
    # `(x == None) & ...`; also dropped an unused `gamma = 'scale'` local.
    if g_lower is None and g_upper is None and g_num is None:
        param_grid = dict(C=c_penalty)
    else:
        gamma = numpy.logspace(g_lower, g_upper, g_num)
        param_grid = dict(C=c_penalty, gamma=gamma)

    # FIX: StratifiedKFold only honors random_state when shuffle=True and
    # raises a ValueError in scikit-learn >= 0.24 otherwise.
    kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=rs)
    score = make_scorer(f1_score, average='micro')

    ##
    logger.info('<--Start grid search over C and gamma-->')
    grid_search = GridSearchCV(support_vector_machine_classifier,
                               param_grid,
                               scoring=score,
                               n_jobs=machines,
                               cv=kfold,
                               verbose=3)
    opt_model = grid_search.fit(x_train, y_train.values.flatten())
    logger.info("Best score: [{:f}] using [{}]".format(opt_model.best_score_,
                                                       opt_model.best_params_))

    ##
    logger.info('<--Make prediction and write out-->')
    prediction = opt_model.predict(x_test)
    # FIX: f1_score expects (y_true, y_pred); value identical for micro
    # averaging, but the order is now conventional.
    prediction_score = f1_score(y_test.values.flatten(),
                                prediction,
                                average='micro')
    logger.info('Check prediction score on validation set := [{:f}]'.format(
        prediction_score))

    output = pandas.Series(prediction, name='y')
    output.to_csv(os.path.join(dt.output_dir(),
                               'SVM_{:s}.csv'.format(comment)),
                  index=True,
                  header=['y'],
                  index_label=['id'])
    # Confusion matrix with rows = true labels, columns = predictions.
    print(
        confusion_matrix(y_test.values.flatten(),
                         prediction,
                         labels=list(range(len(set(y_test.y.values))))))

    return prediction, opt_model
Ejemplo n.º 9
0
import os

import pandas as pd

import data as dt
import matplotlib.pyplot as plt

# Load the raw grid-search dump (no header): three columns, presumably
# two parameter axes and a score -- confirm against the producer script.
results = pd.read_csv(os.path.join(dt.output_dir(),
                                   'svm_train.scale_grid_search'),
                      header=None,
                      index_col=None)

xs, ys, zs = (results.iloc[:, i] for i in range(3))

# Scatter the parameter grid, coloring each point by its score.
figure, axes = plt.subplots()
axes.scatter(xs, ys, c=zs)
plt.show()

print("DONE")
Ejemplo n.º 10
0
def regression(seed, start, end, step, cv=3, comment=''):
    """Two-stage LassoCV regression for task 1.

    Stage 1 fits LassoCV on engineered features (squares and sines of the
    standardized columns) and flags observations whose absolute residual
    exceeds mean + 1.64 * std. Stage 2 refits without those observations
    and writes the test predictions to 'task1_solution_<comment>.csv'.

    Parameters
    ----------
    seed, start, end, step : numbers defining the alpha grid
        ``alphas = seed ** numpy.arange(start, end, step)``.
    cv : int
        Number of cross-validation folds for LassoCV.
    comment : str
        Suffix for the output file name.
    """
    logger = logging.getLogger(__name__)

    ##
    logger.info('read provided data')
    X_test, X_train, y_train = read_data()
    std_train, std_test = transform_data(X_train=X_train, X_test=X_test)

    ## Flag per-column outliers (|x| > mean + 3.5 * std) as NaN.
    removed = 0
    for col in std_train.columns:
        data = std_train[col].copy()
        mask = numpy.abs(data) > data.mean() + 3.5 * data.std()
        std_train.loc[mask, col] = numpy.NaN
        removed += sum(mask)
        del data, mask
    logger.info('removed a total of [{}] elements'.format(removed))

    ## FIX: removed a dead `elif False:` branch that referenced undefined
    ## names (std_train_temp, CLEAN_MODE, choose) and the enclosing
    ## `if True:`; also corrected the log message (fill value is 1e-3).
    logger.info('fill NaN with 1e-3, close to the mean of the standardized random variables')
    std_train.fillna(1e-3, inplace=True)
    std_test.fillna(1e-3, inplace=True)

    ## Feature engineering: append squared and sine transforms of every column.
    logger.info('feature engineering')
    base_columns = std_train.copy().columns
    base_train = std_train.copy()
    base_test = std_test.copy()

    names = base_columns + '_sq'
    train_sq = base_train.pow(2)
    train_sq.columns = names
    std_train = pandas.concat([std_train, train_sq], axis=1)

    test_sq = base_test.pow(2)
    test_sq.columns = names
    std_test = pandas.concat([std_test, test_sq], axis=1)

    names = base_columns + '_sin'
    train_sq = numpy.sin(base_train)
    train_sq.columns = names
    std_train = pandas.concat([std_train, train_sq], axis=1)

    test_sq = numpy.sin(base_test)
    test_sq.columns = names
    std_test = pandas.concat([std_test, test_sq], axis=1)

    ##
    logger.info('use lasso regression with custom set of lambda parameters')
    alphas = seed**numpy.arange(start, end, step)
    logger.info('alpha parameters := {}'.format(
        str(["{0:0.2f}".format(i) for i in alphas]).replace("'", "")))
    reg = LassoCV(alphas=alphas, cv=cv, n_jobs=2, random_state=12357)
    model_cv = reg.fit(std_train.values, y_train.values.flatten())
    logger.info('alpha := {:f}'.format(float(model_cv.alpha_)))
    # FIX: dropped an unused first-stage prediction on std_test; only the
    # second-stage model's prediction is written out below.
    resid = y_train.values.flatten() - model_cv.predict(std_train)

    ##
    logger.info('plotting of first stage results')
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(17, 10))
    f.suptitle('first stage')
    ax1.plot(resid, 'bo')
    # Observations beyond mean + 1.64 sigma are dropped in stage 2.
    tau = numpy.mean(resid) + 1.64 * numpy.std(resid)
    mask = numpy.abs(resid) > tau
    ax1.plot([i if numpy.abs(i) > tau else None for i in resid], 'ro')
    ax1.set_title('Residuals')
    ax2.scatter(model_cv.predict(std_train), y_train)
    x0, x1 = ax2.get_xlim()
    y0, y1 = ax2.get_ylim()
    ax2.set_aspect((x1 - x0) / (y1 - y0))
    ax2.set_title('Fitted vs. Actual')

    ##
    logger.info(
        'use second lasso regression, removing large error inducing observations'
    )
    std_train_ = std_train[~mask]
    y_train_ = y_train[~mask]
    reg = LassoCV(alphas=alphas, cv=cv, n_jobs=2, random_state=12357)
    model_cv = reg.fit(std_train_.values, y_train_.values.flatten())
    logger.info('alpha := {:f}'.format(float(model_cv.alpha_)))
    pred = model_cv.predict(std_test)
    resid = y_train_.values.flatten() - model_cv.predict(std_train_)

    ##
    logger.info('plotting of second stage results')
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(17, 10))
    f.suptitle('second stage')
    ax1.plot(resid, 'bo')
    tau = numpy.mean(resid) + 1.6 * numpy.std(resid)
    mask = numpy.abs(resid) > tau
    ax1.plot([i if numpy.abs(i) > tau else None for i in resid], 'ro')
    ax1.set_title('Residuals')
    # NOTE(review): fitted-vs-actual is evaluated on the FULL training set,
    # including the rows excluded from the stage-2 fit -- confirm intended.
    ax2.scatter(model_cv.predict(std_train), y_train)
    x0, x1 = ax2.get_xlim()
    y0, y1 = ax2.get_ylim()
    ax2.set_aspect((x1 - x0) / (y1 - y0))
    ax2.set_title('Fitted vs. Actual, RMSE := {:.6f}'.format(
        mean_squared_error(y_train, model_cv.predict(std_train))))

    ##
    logger.info('write to pandas Series object')
    write_to_file = pandas.Series(pred,
                                  index=X_test.index.astype(int),
                                  name='y')
    write_to_file.to_csv(os.path.join(dt.output_dir(),
                                      'task1_solution_{}.csv'.format(comment)),
                         index=True,
                         header=['y'],
                         index_label=['id'])
Ejemplo n.º 11
0
    grid_search = GridSearchCV(model,
                               param_grid,
                               scoring="balanced_accuracy",
                               n_jobs=4,
                               cv=kfold)
    opt_ada_boost_params = grid_search.fit(x_train, y_train.flatten())
    logger.info("Best: [{:f}] using [{}]".format(
        opt_ada_boost_params.best_score_, opt_ada_boost_params.best_params_))

    return opt_ada_boost_params


#######################################################################
if __name__ == '__main__':

    # Console logging: INFO-level logger streaming to stdout.
    root = logging.getLogger(__name__)
    root.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    root.addHandler(handler)

    ## Run the cluster smoke test and write a tiny marker CSV.
    user = '******'
    logger = logging.getLogger(__name__)
    opt_params = cluster_tester(n=100, k=10)
    marker = pandas.Series([1, 2, 3, 4], index=[1, 2, 3, 4])
    marker.to_csv(
        os.path.join(dt.output_dir(), 'euler_cluster_test_out.csv'))
    logger.info('{} <Job Done>'.format(user))
Ejemplo n.º 12
0
def _write_class_predictions(y_score, label):
    """Print expected-vs-realized class ratios and write the 1-based argmax
    class predictions of `y_score` to `<label>.csv` in the output dir."""
    result = pd.Series(np.argmax(y_score, axis=1))

    # Expected class shares vs. realized shares, per class.
    expected = [0.526, 0.418, 0.0548]
    for i in range(3):
        print("class expected/realized class ratio [%s]: [%s/%s]" %
              (i, expected[i], sum(result == i) / len(result)))
    print("")

    result += 1  # classes are written out 1-based
    result.index.name = 'Id'
    result.name = 'y'
    pd.DataFrame(result).to_csv(os.path.join(dt.output_dir(),
                                             "%s.csv" % label))


def main():
    """Train one model per subject on the task5 EEG/EMG data, predict the
    two test halves per subject, and write several ensemble predictions.

    FIX: the class-ratio/CSV-writing block was duplicated verbatim for the
    two ensembles; it is now the `_write_class_predictions` helper.
    """
    N = 21600            # rows per training subject
    validate_size = 2000
    epochs = 50
    type = ModelType.CNN_LSTM

    ###################################
    ### Read train data and fit models
    ###################################

    eeg1 = pd.read_csv(os.path.join(dt.data_dir(), 'task5', 'train_eeg1.csv'),
                       header=0,
                       index_col=0)
    eeg2 = pd.read_csv(os.path.join(dt.data_dir(), 'task5', 'train_eeg2.csv'),
                       header=0,
                       index_col=0)
    emg = pd.read_csv(os.path.join(dt.data_dir(), 'task5', 'train_emg.csv'),
                      header=0,
                      index_col=0)
    labels = pd.read_csv(os.path.join(dt.data_dir(), 'task5',
                                      'train_labels.csv'),
                         header=0,
                         index_col=0)

    ##########################
    ### subject one model: rows [0, N)
    start = 0
    end = N
    label = 'subject1_%s_%s_epochs' % (type, epochs)
    subject1_model = train(eeg1=eeg1.iloc[start:end, :],
                           eeg2=eeg2.iloc[start:end, :],
                           emg=emg.iloc[start:end, :],
                           labels=labels.iloc[start:end, :],
                           type=type,
                           validate_size=validate_size,
                           epochs=epochs,
                           label=label)

    ##########################
    ### subject two model: rows [N, 2N)
    start = N
    end = N * 2
    label = 'subject2_%s_%s_epochs' % (type, epochs)
    subject2_model = train(eeg1=eeg1.iloc[start:end, :],
                           eeg2=eeg2.iloc[start:end, :],
                           emg=emg.iloc[start:end, :],
                           labels=labels.iloc[start:end, :],
                           type=type,
                           validate_size=validate_size,
                           epochs=epochs,
                           label=label)

    ##########################
    ### subject three model: rows [2N, 3N - 500)
    # NOTE(review): the last 500 rows of subject three are excluded --
    # confirm why against the data source.
    start = N * 2
    end = N * 3 - 500
    label = 'subject3_%s_%s_epochs' % (type, epochs)
    subject3_model = train(eeg1=eeg1.iloc[start:end, :],
                           eeg2=eeg2.iloc[start:end, :],
                           emg=emg.iloc[start:end, :],
                           labels=labels.iloc[start:end, :],
                           type=type,
                           validate_size=validate_size,
                           epochs=epochs,
                           label=label)

    ##############################################
    ### Models fitted, read test data and predict
    ##############################################

    eeg1_test = pd.read_csv(os.path.join(dt.data_dir(), 'task5',
                                         'test_eeg1.csv'),
                            header=0,
                            index_col=0)
    eeg2_test = pd.read_csv(os.path.join(dt.data_dir(), 'task5',
                                         'test_eeg2.csv'),
                            header=0,
                            index_col=0)
    emg_test = pd.read_csv(os.path.join(dt.data_dir(), 'task5',
                                        'test_emg.csv'),
                           header=0,
                           index_col=0)
    # Test halves: first N rows (A) and the remainder (B).
    eeg1_test_A = df_row_norm(eeg1_test.iloc[:N, :].fillna(0)).values
    eeg2_test_A = df_row_norm(eeg2_test.iloc[:N, :].fillna(0)).values
    emg_test_A = df_row_norm(emg_test.iloc[:N, :].fillna(0)).values

    eeg1_test_B = df_row_norm(eeg1_test.iloc[N:, :].fillna(0)).values
    eeg2_test_B = df_row_norm(eeg2_test.iloc[N:, :].fillna(0)).values
    emg_test_B = df_row_norm(emg_test.iloc[N:, :].fillna(0)).values

    X_test_A = np.dstack((eeg1_test_A, eeg2_test_A, emg_test_A))
    X_test_B = np.dstack((eeg1_test_B, eeg2_test_B, emg_test_B))

    #################################
    ### subject one model prediction
    label = 'subject_1_%s_weighted_%s_epochs' % (type, epochs)
    y_subject1_score = predict(X_test_A,
                               X_test_B,
                               model=subject1_model,
                               type=type,
                               weights=[1, 0.5, 2.5],
                               label=label)

    #################################
    ### subject two model prediction
    label = 'subject_2_%s_weighted_%s_epochs' % (type, epochs)
    y_subject2_score = predict(X_test_A,
                               X_test_B,
                               model=subject2_model,
                               type=type,
                               weights=[1, 0.5, 2.0],
                               label=label)

    ###################################
    ### subject three model prediction
    label = 'subject_3_%s_weighted_%s_epochs' % (type, epochs)
    y_subject3_score = predict(X_test_A,
                               X_test_B,
                               model=subject3_model,
                               type=type,
                               weights=[1, 0.5, 4.5],
                               label=label)

    ##################################
    ### equal-weight ensemble of the three subject models
    label = 'all_subjects_%s_%s_epochs' % (type, epochs)
    y_score = y_subject1_score * 0.33 + y_subject2_score * 0.33 + y_subject3_score * 0.33
    _write_class_predictions(y_score, label)

    ##################################
    ### weighted ensemble (subject two dropped, classes re-weighted)
    label = 'all_subjects_%s_weighted_%s_epochs' % (type, epochs)
    y_score = (y_subject1_score * 0.5 + y_subject2_score * 0 +
               y_subject3_score * 0.5) * [1.5, 0.8, 1.6]
    _write_class_predictions(y_score, label)

    print("DONE")
Ejemplo n.º 13
0
def analyze_all_tram_lines():
	"""Regress hourly, weekly-de-seasoned tram delays on weather features
	(rain, a snow proxy, temperature) plus one-hot weekday/hour, per line.

	For every tram line a linear model is fitted, a predicted-vs-actual plot
	is saved, and per-line correlations / R^2 / MSE are collected into
	'correlation_all_tram_lines.csv' in the output directory.
	"""
	import math

	from sklearn.preprocessing import OneHotEncoder
	from sklearn.linear_model import LinearRegression
	from sklearn.metrics import r2_score
	from sklearn.metrics import mean_squared_error

	logger = logging.getLogger(__name__)

	##
	logger.info('<--Read bus delay data-->')
	zvv = pandas.read_hdf(os.path.join('data', 'zvv_all_tram_lines.h5'))

	##
	logger.info('<--Read weather data & adjust for outliers-->')
	weather = dt.get_iac_weather_data()

	q = 3 #weather.rain.quantile(0.99975)
	mask = weather.rain < q
	weather = weather[mask]
	del mask, q

	##
	logger.info('<--Pre-process tram delay data-->')
	zvv.loc[:, 'diff'] = zvv.ist_an_von - zvv.soll_an_von
	zvv.loc[:, 'time'] = pandas.to_datetime(zvv.soll_an_von.astype(float), errors='coerce', unit='s')
	zvv.time = zvv.time.dt.strftime('%H:%M')
	zvv.loc[:, 'datetime'] = pandas.to_datetime(zvv.datum_von.astype(str) + ' ' + zvv.time)
	zvv.datetime = zvv.datetime.dt.round('60min')

	##
	logger.info('<--Extract weather measures-->')
	weather.loc[:, 'datetime'] = weather.index.round('60min')

	resampleSumWeatherByHour = weather.resample('H').sum()
	resampleMeanWeatherByHour = weather.resample('H').mean()

	# Snow proxy: rainfall measured while mean air temperature was below 0.
	maskSnow = resampleMeanWeatherByHour.T_air < 0
	feature = resampleMeanWeatherByHour.rain * maskSnow.astype(int) 

	##
	logger.info('<--Compute de-seasoning for all tram lines-->')
	container = []
	# Lines 753 and 29 are excluded -- TODO confirm why.
	for line in numpy.sort(numpy.setdiff1d(zvv.linie.unique(), [753, 29])):

		##
		logger.info('<--Compute groupby sum on datetime for all tram line %i -->' %line)
		# FIX: copy the per-line slice so set_index(inplace=True) does not
		# operate on a view of zvv (SettingWithCopy).
		transport = zvv[zvv.linie == line].copy()
		transport.set_index('datetime', drop=True, inplace=True)
		transport.index = pandas.to_datetime(transport.index)
		transport = transport.groupby(transport.index).sum()

		# Remove weekly seasonality by differencing against one week earlier.
		timeDelta = datetime.timedelta(days=7)
		temp = transport['diff'].copy() - transport['diff'].shift(freq=timeDelta)
		weeklyDetrendedtram = temp.dropna(how='all', axis=0)
		weeklyDetrendedtram = weeklyDetrendedtram.interpolate()
		del timeDelta, temp

		##
		logger.info('<--Combine line %i features into new data-frame-->' %line)
		window = 6
		combine = [
			weeklyDetrendedtram.rolling(window=window).mean(),
			resampleSumWeatherByHour.rain.rolling(window=window).mean(),
			pandas.Series(feature.rolling(window=window).mean(), name='snow'),
			pandas.Series(resampleMeanWeatherByHour.T_air.rolling(window=window).mean(), name='temp')
		]

		df = pandas.concat(combine, axis=1).dropna(how='any')
		df.loc[:, 'weekday'] = df.index.dayofweek
		df.loc[:, 'hour'] = df.index.hour

		# Keep only observations with a positive delay.
		mask = (df['diff'] > 0)
		df = df[mask]
		corr = df.corr()

		logger.info('<--1. Categorical features -> one-hot encoder-->')
		data = df.sort_values(['weekday', 'hour'])
		encoder = OneHotEncoder()
		
		categoricalFeatures = [
			'weekday', 
			'hour'
		]

		enc = encoder.fit(data.loc[:, categoricalFeatures])
		categoricalData = enc.transform(data.loc[:, categoricalFeatures])

		logger.info('<--2. Ordinal features -> no transform -->')
		target = ['diff']
		# FIX: the original called data.drop(...) without assigning the
		# result, so weekday/hour stayed in the ordinal feature set and
		# entered the regression twice (raw and one-hot encoded).
		data = data.drop(columns=categoricalFeatures)
		ordinalFeatures = data.columns.difference(target)

		logger.info('<--2. Regression-->')
		trainX = numpy.hstack([data.loc[:, ordinalFeatures].values, categoricalData.todense()])
		trainY = data.loc[:, target].values.flatten()
		reg = LinearRegression(fit_intercept=True)
		reg.fit(X=trainX, y=trainY)
		predict = reg.predict(X=trainX)

		logger.info('<--3. Results & Plot-->')
		# Reference line through predicted-vs-actual for the plot.
		a, b = numpy.polyfit(trainY, predict, deg=1)
		f = lambda x: a*x + b

		fig, ax = plt.subplots(1)
		ax.scatter(y=trainY, x=predict, color='red', marker='x')
		ax.plot(predict, f(predict))
		ax.set_aspect('equal')
		ax.grid(True)
		ax.set_ylabel('Actual - delay')
		ax.set_xlabel('Predicted - delay')
		ax.set_title('Linear Regression Model - Line %i' %line)

		r2 = r2_score(trainY, predict)
		mse = mean_squared_error(trainY, predict)

		corrDelayRain = mpatches.Patch(color='blue', label='R^2 %.4f' %r2)
		corrDelaySnow = mpatches.Patch(color='blue', label='RMSE %.4f' %math.sqrt(mse))
		plt.legend(handles=[corrDelayRain, corrDelaySnow])

		plt.savefig(os.path.join(dt.output_dir(), 'line_%i_prediction.png' %line))

		logger.info('<--4. Correlation structure-->')
		print(df.corr())

		logger.info('<--5. Save summary statistics to file-->')
		stats = pandas.Series(data=corr.loc['diff',:], name=line)
		stats['r2'] = r2
		stats['mse'] = mse
		container.append(stats)

	pandas.concat(container, axis=1).to_csv(os.path.join(dt.output_dir(), 'correlation_all_tram_lines.csv'))
Ejemplo n.º 14
0
	axis=0
	ax[axis].plot(yData.index, yData) 
	ax[axis].set_ylabel('Delay [s]')

	axis+=1
	ax[axis].bar(xData.index, height=xData, width=0.05, color='green')
	ax[axis].set_xlabel('YYYY-MM-DD:HH')
	ax[axis].set_ylabel('Precipitation [mm]')
	plt.tight_layout()

	fig.savefig(os.path.join(dt.output_dir(), 'delay_vs_rainfall.png'))

	del mask, xData, yData

	'''
	Description:
		Scatter plot between AVERAGE TEMPRATURE and DELAYS
	'''
	xData = averageWeatherDelays.reindex(index=weeklySeasoned.index)['temp_degrees_c_mittel']
	yData = weeklySeasoned['diff']
	plt.figure()
	plt.scatter(x=xData, y=yData, marker='x')
	plt.xlabel('AVG TEMPERATURE [C]')
	plt.ylabel('DE-SEASONED DELAY [s]')
	plt.tight_layout()
Ejemplo n.º 15
0
    cmd = '{0} "{1}" "{2}" "{3}"'.format(svmpredict_exe, scaled_test_file,
                                         model_file, predict_test_file)
    print('Testing...')
    print("run [%s]" % cmd)
    Popen(cmd, shell=True).communicate()

    print('Output prediction: {0}'.format(predict_test_file))

    return predict_test_file


#######################################################################
if __name__ == "__main__":

    train_file = os.path.join(dt.output_dir(), 'svm_train')
    validate_file = os.path.join(dt.output_dir(), 'svm_validate')
    all_train_file = os.path.join(dt.output_dir(), 'svm_all_train')
    test_file = os.path.join(dt.output_dir(), 'svm_test')
    result_file = os.path.join(dt.output_dir(), 'svm_result')

    #nrows = 100
    nrows = 3030 + 443 + 1474 + 170

    y = pd.read_csv(os.path.join(dt.data_dir(), 'task3', 'y_train.csv'),
                    header=0,
                    index_col=0,
                    nrows=nrows)

    X_fft = pd.read_csv(os.path.join(dt.data_dir(), 'task3',
                                     'X_train_fft.csv'),