def main():
    # Read the dataset.
    dataset_path = './dataset/'
    dataset_file_path = './dataset_file_path.csv'
    df_path = pd.read_csv(dataset_file_path)
    all_file_param = read_dataset.read_all_dataset(df_path)
    file_amount = len(all_file_param)
    plot_files = ['Training set Microclimate (2 hour intervals)']
    yield_file = 'Target Variable Water Yield'
    # plot_files = ['Training set Microclimate (2 hour intervals)',
    #               'Training set Microclimate (5 minute intervals)']

    # Set up features for micro files.
    micro_features = read_dataset.set_features(dataset_path, plot_files)

    # Read yield file for micro training file.
    yield_df = None
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name == yield_file:
            yield_path = dataset_path + file.file_path
            yield_df = pd.read_csv(yield_path)
            break

    # Traverse all the datasets.
    for k in xrange(file_amount):
        file = all_file_param[k]

        if file.data_name not in plot_files:
            continue
        print '==========' + file.data_name + '=========='
        path = dataset_path + file.file_path
        df = pd.read_csv(path)

        # Split the micro training file into training and test sets.
        X_train, X_test, y_train, y_test, train_data, test_data = split_and_build_class(
            df.values, yield_df.values)
        # [train_data, test_data] = split_and_build_class(df.values, yield_df.values)

        # Fit the regressor and predict on the test set.
        clf = run_regression(X_train[:, 1:], y_train)
        y_hat_test = clf.predict(X_test[:, 1:])

        cmap = plt.get_cmap('jet_r')
        plt.figure(figsize=(10, 10))

        interval = file.interval
        interval_minute = read_dataset.get_interval_minute(interval)

        test_size = y_hat_test.shape[0]
        plt.plot([i for i in xrange(test_size)], y_hat_test)
        plt.plot([i for i in xrange(test_size)], y_test)
        plt.legend(['Prediction', 'Real'])
        plt.suptitle('Cross validation + Bagging Regressor')
        plt.savefig('Cross validation + Bagging Regressor.png',
                    bbox_inches='tight')

        loss = np.sqrt(mean_squared_error(y_test, y_hat_test))
        print 'Cross validation + Bagging Regressor loss =', loss
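
# run_regression is not defined in these snippets. Judging by the plot
# title above ('Cross validation + Bagging Regressor'), a minimal sketch
# might look like the following; the estimator, parameter grid, and fold
# count are assumptions, not the original implementation.
def run_regression_sketch(X_train, y_train):
    from sklearn.ensemble import BaggingRegressor
    from sklearn.model_selection import GridSearchCV

    # Cross-validate the ensemble size and return the fitted search object;
    # its predict() delegates to the best estimator found.
    param_grid = {'n_estimators': [10, 50, 100]}
    clf = GridSearchCV(BaggingRegressor(), param_grid, cv=5)
    clf.fit(X_train, y_train)
    return clf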
def plot_one_feature(plot_files, all_file_param, dataset_path, feature):
    file_amount = len(all_file_param)

    # Initiate the plot.
    cmap = plt.get_cmap('jet_r')
    plt.figure(figsize=(25, 10))
    # plot_for_legend = plt.subplot()

    # Color setup for single file.
    color = cmap(float(9) / file_amount)

    # Traverse all the datasets.
    print '==========' + feature + '=========='
    for k in xrange(file_amount):
        file = all_file_param[k]

        # Color setup for multiple files.
        # color = cmap(float(k) / file_amount)

        # Use this block to select which datasets you want to plot.
        if file.data_name not in plot_files:
            continue

        print 'Plotting ' + file.data_name + '...'
        path = dataset_path + file.file_path
        df = pd.read_csv(path)
        all_data = read_dataset.microData()
        all_data.get_data(df.values)

        interval = file.interval
        interval_minute = read_dataset.get_interval_minute(interval)
        prev_time = all_data.data_time[0]
        for i in xrange(1, len(all_data.data_time)):
            current_time = all_data.data_time[i]
            diff = current_time - prev_time
            if not diff.days and diff.seconds / 60 <= interval_minute:
                plt.plot(
                    [prev_time, current_time],
                    all_data.read_feature(feature)[i - 1: i + 1],
                    c=color
                )
            prev_time = current_time

            if i % 1000 == 0:
                print 'Plotted', i, 'lines...'

        print 'Plotting done!'

    plt.legend(plot_files)
    plt.suptitle(feature + ' in ' + str(plot_files))
    plt.savefig('./micro_features_plot/' + feature + '.png', bbox_inches='tight')
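
# Hedged usage sketch: plot every feature returned by
# read_dataset.set_features. The variable names are taken from the main()
# example above; this loop is an illustration, not original code.
#
#     micro_features = read_dataset.set_features(dataset_path, plot_files)
#     for feature in micro_features:
#         plot_one_feature(plot_files, all_file_param, dataset_path, feature)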
def main():
    # Read the dataset.
    dataset_path = './dataset/'
    dataset_file_path = './dataset_file_path.csv'
    df_path = pd.read_csv(dataset_file_path)
    all_file_param = read_dataset.read_all_dataset(df_path)
    file_amount = len(all_file_param)
    plot_files = ['Training set Microclimate (2 hour intervals)']
    yield_file = 'Target Variable Water Yield'
    # plot_files = ['Training set Microclimate (2 hour intervals)',
    #               'Training set Microclimate (5 minute intervals)']

    # Set up features for micro files.
    micro_features = read_dataset.set_features(dataset_path, plot_files)

    # Read yield file for micro training file.
    yield_df = None
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name == yield_file:
            yield_path = dataset_path + file.file_path
            yield_df = pd.read_csv(yield_path)
            break

    # Traverse all the datasets.
    for k in xrange(file_amount):
        file = all_file_param[k]

        if file.data_name not in plot_files:
            continue
        print '==========' + file.data_name + '=========='
        path = dataset_path + file.file_path
        df = pd.read_csv(path)

        # Split the micro training file into training and test sets.
        X_train, X_test, y_train, y_test, train_data, test_data = split_and_build_class(df.values, yield_df.values)
        # [train_data, test_data] = split_and_build_class(df.values, yield_df.values)

        # Fit the regressor and predict on the test set.
        clf = run_regression(X_train[:, 1:], y_train)
        y_hat_test = clf.predict(X_test[:, 1:])

        cmap = plt.get_cmap('jet_r')
        plt.figure(figsize=(10, 10))

        interval = file.interval
        interval_minute = read_dataset.get_interval_minute(interval)

        test_size = y_hat_test.shape[0]
        plt.plot([i for i in xrange(test_size)], y_hat_test)
        plt.plot([i for i in xrange(test_size)], y_test)
        plt.legend(['Prediction', 'Real'])
        plt.suptitle('Cross validation + Random Forest Regressor')
        plt.savefig('Cross validation + Random Forest Regressor.png', bbox_inches='tight')

        loss = np.sqrt(mean_squared_error(y_test, y_hat_test))
        print 'Cross validation + Random Forest Regressor loss =', loss

        '''
        =======================================================================
        '''

        # Predict test and write submission
        submission_file_name = 'Submission format'
        submission_file = None
        test_file_name = 'Test set Microclimate (2 hour intervals)'
        test_file = None

        for k in xrange(file_amount):
            file = all_file_param[k]
            if file.data_name == submission_file_name:
                submission_file = file
                break
        submission_path = dataset_path + submission_file.file_path
        df_submission = pd.read_csv(submission_path, index_col=0, parse_dates=[0])

        for k in xrange(file_amount):
            file = all_file_param[k]
            if file.data_name == test_file_name:
                test_file = file
                break
        test_path = dataset_path + test_file.file_path
        df_test = pd.read_csv(test_path, index_col=0, parse_dates=[0])

        X_combined = write_submission.combine_table(df_submission, df_test)
        imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
        fixed_X = X_combined.values[:, 0:]
        imp.fit(fixed_X)
        X_combined.values[:, 0:] = imp.transform(fixed_X)
        preprocessing.normalize(X_combined.values, copy=False)
        y_submission = write_submission.write_submission(
            X_combined, clf, df_submission,
            'Cross Validation + Random Forest Regressor Submission')
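
# Note: preprocessing.Imputer above is the pre-0.20 scikit-learn API. On
# modern scikit-learn, a rough equivalent (a sketch, not part of the
# original code) would be:
#
#     from sklearn.impute import SimpleImputer
#     imp = SimpleImputer(missing_values=np.nan, strategy='mean')
#     X_combined.values[:, 0:] = imp.fit_transform(X_combined.values[:, 0:])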
def main():
    # Read the dataset.
    dataset_path = './dataset/'
    dataset_file_path = './dataset_file_path.csv'
    df_path = pd.read_csv(dataset_file_path)
    all_file_param = read_dataset.read_all_dataset(df_path)
    file_amount = len(all_file_param)

    # Initiate the plot.
    cmap = plt.get_cmap('jet_r')
    plt.figure(figsize=(10, 10))
    plot_for_legend = plt.subplot()

    # Traverse all the datasets.
    for k in xrange(file_amount):
        file = all_file_param[k]

        # Use this block to set which dataset you want to find missing intervals.
        # if file.data_name != 'Macroclimate Guelmim Airport':
            # continue

        print '==========' + file.data_name + '=========='
        path = dataset_path + file.file_path
        df = pd.read_csv(path)
        data_time = df.values[:, 0]

        interval = file.interval
        start_time = datetime.datetime.strptime(file.start_time, '%H:%M')
        end_time = datetime.datetime.strptime(file.end_time, '%H:%M')
        missing_data = []

        prev_time = read_dataset.assign_time(data_time[0])
        current_time = None

        color = cmap(float(k) / file_amount)

        for i in range(1, data_time.shape[0]):
            tmp = data_time[i]
            current_time = read_dataset.assign_time(tmp)
            diff = current_time - prev_time

            # Find missing intervals whose gap is larger than the default interval.
            if diff.days or diff.seconds / 60 > read_dataset.get_interval_minute(interval):
                missing_interval = [prev_time, current_time]
                missing_data += fix_interval(start_time, end_time, missing_interval)
            prev_time = current_time

        # Output all the missing intervals.
        for item in missing_data:
            print item[0].strftime('%Y-%m-%d %H:%M:%S'), \
                item[1].strftime('%Y-%m-%d %H:%M:%S')
        print len(missing_data), 'missing intervals found.'

        # Plot the missing intervals.
        for item in missing_data:
            plt.plot(item, [(k + 1) for j in xrange(2)], c=color)
        plt.ylim([0, file_amount + 1])
        plot_for_legend.plot([], [], c=color, label=file.data_name)

    # Set the position, legend, and subtitle of the plot.
    box = plot_for_legend.get_position()
    plot_for_legend.set_position([box.x0, box.y0 + box.height * 0.2,
                                  box.width, box.height * 0.8])
    legend = plot_for_legend.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
                                    fancybox=True, shadow=True, ncol=2)
    plt.suptitle('Missing Intervals for the Dataset')
    plt.savefig('missing_intervals_for_the_dataset.png')
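
# read_dataset.assign_time is not shown in these snippets. Given that the
# timestamps above are printed with '%Y-%m-%d %H:%M:%S', a minimal sketch
# (the format string is an assumption) could be:
def assign_time_sketch(value):
    import datetime

    # Parse a timestamp string from the first CSV column into a datetime.
    return datetime.datetime.strptime(str(value), '%Y-%m-%d %H:%M:%S')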
def main():
    # Read the dataset.
    dataset_path = './dataset/'
    dataset_file_path = './dataset_file_path.csv'
    df_path = pd.read_csv(dataset_file_path)
    all_file_param = read_dataset.read_all_dataset(df_path)
    file_amount = len(all_file_param)
    plot_files = ['Training set Microclimate (2 hour intervals)']
    yield_file = 'Target Variable Water Yield'
    # plot_files = ['Training set Microclimate (2 hour intervals)',
    #               'Training set Microclimate (5 minute intervals)']

    # Set up features for micro files.
    micro_features = read_dataset.set_features(dataset_path, plot_files)

    # Read yield file for micro training file.
    yield_df = None
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name == yield_file:
            yield_path = dataset_path + file.file_path
            yield_df = pd.read_csv(yield_path)
            break

    # Traverse all the datasets.
    for k in xrange(file_amount):
        file = all_file_param[k]

        if file.data_name not in plot_files:
            continue
        print '==========' + file.data_name + '=========='
        path = dataset_path + file.file_path
        df = pd.read_csv(path)

        # Split the micro training file into training and test sets.
        X_train, X_test, y_train, y_test, train_data, test_data = split_and_build_class(df.values, yield_df.values)
        # [train_data, test_data] = split_and_build_class(df.values, yield_df.values)

        y_train_binary = Binary_Classification.transform_to_binary(y_train)

        # Run the binary classifier.
        clf = run_regression(X_train[:, 1:], y_train_binary)
        y_hat_test_binary = clf.predict(X_test[:, 1:])
        print 'Number of Class 1 in Training Data:', np.count_nonzero(y_train_binary)
        print 'Number of Class 1 in Test Data:', np.count_nonzero(y_test)
        print 'Number of Class 1 in Predicted Data:', np.count_nonzero(y_hat_test_binary)

        # Run Ridge Regression.
        X_train_regression = []
        y_train_regression = []
        X_test_regression = []
        y_hat_test_regression = []

        for i in xrange(len(y_train_binary)):
            if y_train_binary[i] != 0:
                X_train_regression.append(X_train[i])
                y_train_regression.append(y_train[i])
        X_train_regression = np.array(X_train_regression)

        '''
        =====Regression=====
        '''
        clf_regression = linear_model.Ridge(normalize=True)
        # clf_regression.fit(X_train_regression[:, 1:], y_train_regression)
        clf_regression.fit(X_train[:, 1:], y_train)
        '''
        ====================
        '''


        for i in xrange(len(y_hat_test_binary)):
            if y_hat_test_binary[i] != 0:
                X_test_regression.append(X_test[i])
        X_test_regression = np.array(X_test_regression)
        if len(X_test_regression):
            y_hat_test_regression = clf_regression.predict(X_test_regression[:, 1:])

        # Recombine predictions: keep 0 where the classifier predicted no
        # yield; otherwise take the next regression estimate in order.
        j = 0
        y_hat_test = []
        if len(X_test_regression):
            for i in xrange(len(y_hat_test_binary)):
                if y_hat_test_binary[i] == 0:
                    y_hat_test.append(y_hat_test_binary[i])
                else:
                    y_hat_test.append(y_hat_test_regression[j])
                    j += 1
            y_hat_test = np.array(y_hat_test)
        else:
            y_hat_test = y_hat_test_binary

        cmap = plt.get_cmap('jet_r')
        plt.figure(figsize=(10, 10))

        interval = file.interval
        interval_minute = read_dataset.get_interval_minute(interval)

        test_size = y_hat_test.shape[0]
        plt.plot([i for i in xrange(test_size)], y_hat_test)
        plt.plot([i for i in xrange(test_size)], y_test)
        plt.legend(['Prediction', 'Real'])
        plt.suptitle('Cross Validation + Bagging Classifier + Ridge Regression')
        plt.savefig('Cross Validation + Bagging Classifier + Ridge Regression.png', bbox_inches='tight')

        loss = np.sqrt(mean_squared_error(y_test, y_hat_test))
        print 'Cross Validation + Bagging Classifier + Ridge Regression loss =', loss

        '''
        =======================================================================
        '''

        # Predict test and write submission
        submission_file_name = 'Submission format'
        submission_file = None
        test_file_name = 'Test set Microclimate (2 hour intervals)'
        test_file = None

        for k in xrange(file_amount):
            file = all_file_param[k]
            if file.data_name == submission_file_name:
                submission_file = file
                break
        submission_path = dataset_path + submission_file.file_path
        df_submission = pd.read_csv(submission_path, index_col=0, parse_dates=[0])

        for k in xrange(file_amount):
            file = all_file_param[k]
            if file.data_name == test_file_name:
                test_file = file
                break
        test_path = dataset_path + test_file.file_path
        df_test = pd.read_csv(test_path, index_col=0, parse_dates=[0])

        X_combined = write_submission.combine_table(df_submission, df_test)
        imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
        fixed_X = X_combined.values[:, 0:]
        imp.fit(fixed_X)
        X_combined.values[:, 0:] = imp.transform(fixed_X)
        preprocessing.normalize(X_combined.values, copy=False)
        y_submission = write_submission.write_submission_binary_classifier_and_regression(
            X_combined, clf, clf_regression, df_submission,
            'Cross Validation + Bagging Classifier + Ridge Regression Submission')
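
# Binary_Classification.transform_to_binary is not shown in these
# snippets. Since zero class predictions are kept as zero yield above and
# class counts come from np.count_nonzero, a minimal sketch (an
# assumption) is a simple nonzero threshold:
def transform_to_binary_sketch(y):
    import numpy as np

    # Map positive yield to class 1, zero yield to class 0.
    return np.where(np.asarray(y) > 0, 1, 0)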