# Shared imports for the scripts in this listing. The code targets
# Python 2 (print statement, xrange) and an older scikit-learn release
# that still ships preprocessing.Imputer. The local modules
# read_dataset, write_submission, and Binary_Classification, and the
# helpers split_and_build_class, run_regression, and fix_interval, are
# assumed to be defined elsewhere in this repository.
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model, preprocessing
from sklearn.metrics import mean_squared_error

import read_dataset
import write_submission
import Binary_Classification


def main():
    # Read the dataset.
    dataset_path = './dataset/'
    dataset_file_path = './dataset_file_path.csv'
    df_path = pd.read_csv(dataset_file_path)
    all_file_param = read_dataset.read_all_dataset(df_path)
    file_amount = len(all_file_param)

    plot_files = ['Training set Microclimate (2 hour intervals)']
    yield_file = 'Target Variable Water Yield'
    # plot_files = ['Training set Microclimate (2 hour intervals)',
    #               'Training set Microclimate (5 minute intervals)']

    # Set up features for the microclimate files.
    micro_features = read_dataset.set_features(dataset_path, plot_files)

    # Read the yield file that pairs with the microclimate training file.
    yield_df = None
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name == yield_file:
            yield_path = dataset_path + file.file_path
            yield_df = pd.read_csv(yield_path)
            break

    # Traverse all the datasets.
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name not in plot_files:
            continue
        print '==========' + file.data_name + '=========='
        path = dataset_path + file.file_path
        df = pd.read_csv(path)

        # Split the microclimate training file into a training set and a
        # test set.
        X_train, X_test, y_train, y_test, train_data, test_data = \
            split_and_build_class(df.values, yield_df.values)
        # [train_data, test_data] = split_and_build_class(df.values, yield_df.values)

        # Fit the regressor and predict on the held-out set (column 0
        # holds the timestamp, so it is skipped).
        clf = run_regression(X_train[:, 1:], y_train)
        y_hat_test = clf.predict(X_test[:, 1:])

        # Plot prediction against ground truth.
        cmap = plt.get_cmap('jet_r')
        plt.figure(figsize=(10, 10))
        interval = file.interval
        interval_minute = read_dataset.get_interval_minute(interval)
        test_size = y_hat_test.shape[0]
        plt.plot([i for i in xrange(test_size)], y_hat_test)
        plt.plot([i for i in xrange(test_size)], y_test)
        plt.legend(['Prediction', 'Real'])
        plt.suptitle('Cross validation + Bagging Regressor')
        plt.savefig('Cross validation + Bagging Regressor.png',
                    bbox_inches='tight')

        # Report the RMSE on the held-out split.
        loss = np.sqrt(mean_squared_error(y_test, y_hat_test))
        print 'Cross validation + Bagging Regressor loss =', loss
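# split_and_build_class is called throughout this listing but not shown.
# Below is a minimal, hypothetical sketch of its shape -- NOT the
# repository's actual helper. It assumes rows of the feature array and
# the yield array are already aligned one-to-one and ordered by time,
# that column 1 of the yield file holds the target, and that an 80/20
# ordered split is used so the test plot reads as a time series.
def split_and_build_class(X_values, yield_values, test_ratio=0.2):
    X = np.asarray(X_values)
    y = np.asarray(yield_values)[:, 1].astype(float)  # assumed target column
    cut = int(len(X) * (1 - test_ratio))  # ordered split, no shuffling
    X_train, X_test = X[:cut], X[cut:]
    y_train, y_test = y[:cut], y[cut:]
    train_data = (X_train, y_train)
    test_data = (X_test, y_test)
    return X_train, X_test, y_train, y_test, train_data, test_data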
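# run_regression is likewise not shown. The plot titles ('Cross
# validation + Bagging Regressor') suggest a bagging ensemble tuned by
# grid-search cross-validation; the sketch below is an illustrative
# guess at that setup (estimator, grid, and scoring are assumptions,
# written against the pre-0.18 scikit-learn API the rest of the code
# implies). The classifier scripts further down presumably bind
# run_regression to a BaggingClassifier instead.
from sklearn.ensemble import BaggingRegressor
from sklearn.grid_search import GridSearchCV  # sklearn < 0.18


def run_regression(X_train, y_train):
    # Tune the ensemble size by 5-fold cross-validation and return the
    # refit best estimator.
    param_grid = {'n_estimators': [10, 50, 100]}
    search = GridSearchCV(BaggingRegressor(), param_grid, cv=5,
                          scoring='mean_squared_error')
    search.fit(X_train, y_train)
    return search.best_estimator_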
def plot_one_feature(plot_files, all_file_param, dataset_path, feature):
    file_amount = len(all_file_param)

    # Initiate the plot.
    cmap = plt.get_cmap('jet_r')
    plt.figure(figsize=(25, 10))
    # plot_for_legend = plt.subplot()

    # Color setup for a single file.
    color = cmap(float(9) / file_amount)

    # Traverse all the datasets.
    print '==========' + feature + '=========='
    for k in xrange(file_amount):
        file = all_file_param[k]
        # Color setup for multiple files.
        # color = cmap(float(k) / file_amount)

        # Use this block to set which datasets you want to plot.
        if file.data_name not in plot_files:
            continue
        print 'Plotting ' + file.data_name + '...'
        path = dataset_path + file.file_path
        df = pd.read_csv(path)
        all_data = read_dataset.microData()
        all_data.get_data(df.values)
        interval = file.interval
        interval_minute = read_dataset.get_interval_minute(interval)

        # Draw a segment only between consecutive readings that are at
        # most one sampling interval apart, so missing data stays visible
        # as gaps in the curve.
        prev_time = all_data.data_time[0]
        for i in xrange(1, len(all_data.data_time)):
            current_time = all_data.data_time[i]
            diff = current_time - prev_time
            if not diff.days and diff.seconds / 60 <= interval_minute:
                plt.plot([prev_time, current_time],
                         all_data.read_feature(feature)[i - 1:i + 1],
                         c=color)
            prev_time = current_time
            if i % 1000 == 0:
                print 'Plotted', i, 'lines...'
        print 'Plotting done!'

    plt.legend(plot_files)
    plt.suptitle(feature + ' in ' + str(plot_files))
    plt.savefig('./micro_features_plot/' + feature + '.png',
                bbox_inches='tight')
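# A hypothetical driver for plot_one_feature, assuming dataset_path and
# all_file_param are set up exactly as in the main() functions in this
# listing and that read_dataset.set_features returns an iterable of
# feature names:
def plot_all_features():
    dataset_path = './dataset/'
    df_path = pd.read_csv('./dataset_file_path.csv')
    all_file_param = read_dataset.read_all_dataset(df_path)
    plot_files = ['Training set Microclimate (2 hour intervals)']
    micro_features = read_dataset.set_features(dataset_path, plot_files)
    # One figure per feature, saved under ./micro_features_plot/.
    for feature in micro_features:
        plot_one_feature(plot_files, all_file_param, dataset_path, feature)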
def main():
    # Read the dataset.
    dataset_path = './dataset/'
    dataset_file_path = './dataset_file_path.csv'
    df_path = pd.read_csv(dataset_file_path)
    all_file_param = read_dataset.read_all_dataset(df_path)
    file_amount = len(all_file_param)

    plot_files = ['Training set Microclimate (2 hour intervals)']
    yield_file = 'Target Variable Water Yield'
    # plot_files = ['Training set Microclimate (2 hour intervals)',
    #               'Training set Microclimate (5 minute intervals)']

    # Set up features for the microclimate files.
    micro_features = read_dataset.set_features(dataset_path, plot_files)

    # Read the yield file that pairs with the microclimate training file.
    yield_df = None
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name == yield_file:
            yield_path = dataset_path + file.file_path
            yield_df = pd.read_csv(yield_path)
            break

    # Traverse all the datasets.
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name not in plot_files:
            continue
        print '==========' + file.data_name + '=========='
        path = dataset_path + file.file_path
        df = pd.read_csv(path)

        # Split the microclimate training file into a training set and a
        # test set.
        X_train, X_test, y_train, y_test, train_data, test_data = \
            split_and_build_class(df.values, yield_df.values)
        # [train_data, test_data] = split_and_build_class(df.values, yield_df.values)

        # Fit the regressor and predict on the held-out set (column 0
        # holds the timestamp, so it is skipped).
        clf = run_regression(X_train[:, 1:], y_train)
        y_hat_test = clf.predict(X_test[:, 1:])

        # Plot prediction against ground truth.
        cmap = plt.get_cmap('jet_r')
        plt.figure(figsize=(10, 10))
        interval = file.interval
        interval_minute = read_dataset.get_interval_minute(interval)
        test_size = y_hat_test.shape[0]
        plt.plot([i for i in xrange(test_size)], y_hat_test)
        plt.plot([i for i in xrange(test_size)], y_test)
        plt.legend(['Prediction', 'Real'])
        plt.suptitle('Cross validation + Random Forest Regressor')
        plt.savefig('Cross validation + Random Forest Regressor.png',
                    bbox_inches='tight')

        # Report the RMSE on the held-out split.
        loss = np.sqrt(mean_squared_error(y_test, y_hat_test))
        print 'Cross validation + Random Forest Regressor loss =', loss

    ''' ======================================================================= '''
    # Predict on the test set and write the submission.
    submission_file_name = 'Submission format'
    submission_file = None
    test_file_name = 'Test set Microclimate (2 hour intervals)'
    test_file = None
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name == submission_file_name:
            submission_file = file
            break
    submission_path = dataset_path + submission_file.file_path
    df_submission = pd.read_csv(submission_path, index_col=0, parse_dates=[0])
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name == test_file_name:
            test_file = file
            break
    test_path = dataset_path + test_file.file_path
    df_test = pd.read_csv(test_path, index_col=0, parse_dates=[0])

    # Align the submission timestamps with the test-set features, fill
    # missing values with the column mean, and normalize in place.
    X_combined = write_submission.combine_table(df_submission, df_test)
    imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
    fixed_X = X_combined.values[:, 0:]
    imp.fit(fixed_X)
    X_combined.values[:, 0:] = imp.transform(fixed_X)
    preprocessing.normalize(X_combined.values, copy=False)

    y_submission = write_submission.write_submission(
        X_combined, clf, df_submission,
        'Cross Validation + Random Forest Regressor Submission')
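# write_submission.combine_table is not shown either. From its call
# site it must return a DataFrame whose rows line up with the
# submission timestamps and whose columns are the test-set features.
# A hypothetical sketch, assuming both frames are indexed by timestamp
# and the submission index falls within the test-set index:
def combine_table(df_submission, df_test):
    # Keep only the timestamps the submission asks for; forward-fill
    # any submission timestamps that fall between test-set readings.
    return df_test.reindex(df_submission.index, method='ffill')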
def main():
    # Read the dataset.
    dataset_path = './dataset/'
    dataset_file_path = './dataset_file_path.csv'
    df_path = pd.read_csv(dataset_file_path)
    all_file_param = read_dataset.read_all_dataset(df_path)
    file_amount = len(all_file_param)

    # Initiate the plot.
    cmap = plt.get_cmap('jet_r')
    plt.figure(figsize=(10, 10))
    plot_for_legend = plt.subplot()

    # Traverse all the datasets.
    for k in xrange(file_amount):
        file = all_file_param[k]
        # Use this block to set which dataset you want to find missing
        # intervals in.
        # if file.data_name != 'Macroclimate Guelmim Airport':
        #     continue
        print '==========' + file.data_name + '=========='
        path = dataset_path + file.file_path
        df = pd.read_csv(path)
        data_time = df.values[:, 0]
        interval = file.interval
        start_time = datetime.datetime.strptime(file.start_time, '%H:%M')
        end_time = datetime.datetime.strptime(file.end_time, '%H:%M')

        missing_data = []
        prev_time = read_dataset.assign_time(data_time[0])
        current_time = None
        color = cmap(float(k) / file_amount)
        for i in range(1, data_time.shape[0]):
            tmp = data_time[i]
            current_time = read_dataset.assign_time(tmp)
            diff = current_time - prev_time
            # Record gaps larger than the file's default sampling interval.
            if diff.days or diff.seconds / 60 > read_dataset.get_interval_minute(interval):
                missing_interval = [prev_time, current_time]
                missing_data += fix_interval(start_time, end_time, missing_interval)
            prev_time = current_time

        # Output all the missing intervals.
        for item in missing_data:
            print item[0].strftime('%Y-%m-%d %H:%M:%S'), \
                item[1].strftime('%Y-%m-%d %H:%M:%S')
        print len(missing_data), 'missing intervals are found.'

        # Plot the missing intervals, one horizontal band per dataset.
        for item in missing_data:
            plt.plot(item, [(k + 1) for j in xrange(2)], c=color)
        plt.ylim([0, file_amount + 1])
        plot_for_legend.plot([], [], c=color, label=file.data_name)

    # Set the position, legend, and subtitle of the plot.
    box = plot_for_legend.get_position()
    plot_for_legend.set_position(
        [box.x0, box.y0 + box.height * 0.2, box.width, box.height * 0.8])
    legend = plot_for_legend.legend(loc='upper center',
                                    bbox_to_anchor=(0.5, -0.05),
                                    fancybox=True, shadow=True, ncol=2)
    plt.suptitle('Missing Intervals for the Dataset')
    plt.savefig('missing_intervals_for_the_dataset.png')
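# read_dataset.get_interval_minute is used by every script in this
# listing but not included here. A hypothetical sketch, assuming the
# interval field mirrors the dataset names ('2 hour', '5 minute', ...):
def get_interval_minute(interval):
    # Parse strings such as '2 hour' or '5 minute' into minutes.
    amount, unit = interval.split()[:2]
    minutes = int(amount)
    if unit.startswith('hour'):
        minutes *= 60
    elif unit.startswith('day'):
        minutes *= 24 * 60
    return minutes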
def main():
    # Read the dataset.
    dataset_path = './dataset/'
    dataset_file_path = './dataset_file_path.csv'
    df_path = pd.read_csv(dataset_file_path)
    all_file_param = read_dataset.read_all_dataset(df_path)
    file_amount = len(all_file_param)

    plot_files = ['Training set Microclimate (2 hour intervals)']
    yield_file = 'Target Variable Water Yield'
    # plot_files = ['Training set Microclimate (2 hour intervals)',
    #               'Training set Microclimate (5 minute intervals)']

    # Set up features for the microclimate files.
    micro_features = read_dataset.set_features(dataset_path, plot_files)

    # Read the yield file that pairs with the microclimate training file.
    yield_df = None
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name == yield_file:
            yield_path = dataset_path + file.file_path
            yield_df = pd.read_csv(yield_path)
            break

    # Traverse all the datasets.
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name not in plot_files:
            continue
        print '==========' + file.data_name + '=========='
        path = dataset_path + file.file_path
        df = pd.read_csv(path)

        # Split the microclimate training file into a training set and a
        # test set.
        X_train, X_test, y_train, y_test, train_data, test_data = \
            split_and_build_class(df.values, yield_df.values)
        # [train_data, test_data] = split_and_build_class(df.values, yield_df.values)

        # Collapse the yields to binary labels (zero vs. nonzero yield).
        y_train_binary = Binary_Classification.transform_to_binary(y_train)

        # Run the classifier (run_regression is fit on binary labels here).
        clf = run_regression(X_train[:, 1:], y_train_binary)
        y_hat_test_binary = clf.predict(X_test[:, 1:])
        print 'Number of Class 1 in Training Data:', np.count_nonzero(y_train_binary)
        print 'Number of Class 1 in Test Data:', np.count_nonzero(y_test)
        print 'Number of Class 1 in Predicted Data:', np.count_nonzero(y_hat_test_binary)

        # Run Ridge Regression on the rows with nonzero yield.
        X_train_regression = []
        y_train_regression = []
        X_test_regression = []
        y_hat_test_regression = []
        for i in xrange(len(y_train_binary)):
            if y_train_binary[i] != 0:
                X_train_regression.append(X_train[i])
                y_train_regression.append(y_train[i])
        X_train_regression = np.array(X_train_regression)

        ''' =====Regression===== '''
        clf_regression = linear_model.Ridge(normalize=True)
        # clf_regression.fit(X_train_regression[:, 1:], y_train_regression)
        clf_regression.fit(X_train[:, 1:], y_train)
        ''' ==================== '''

        for i in xrange(len(y_hat_test_binary)):
            if y_hat_test_binary[i] != 0:
                X_test_regression.append(X_test[i])
        X_test_regression = np.array(X_test_regression)
        if len(X_test_regression):
            y_hat_test_regression = clf_regression.predict(
                X_test_regression[:, 1:])

        # Merge the two stages: keep 0 where the classifier predicts no
        # water, and use the regression output everywhere else.
        j = 0
        y_hat_test = []
        if len(X_test_regression):
            for i in xrange(len(y_hat_test_binary)):
                if y_hat_test_binary[i] == 0:
                    y_hat_test.append(y_hat_test_binary[i])
                else:
                    y_hat_test.append(y_hat_test_regression[j])
                    j += 1
            y_hat_test = np.array(y_hat_test)
        else:
            y_hat_test = y_hat_test_binary

        # Plot prediction against ground truth.
        cmap = plt.get_cmap('jet_r')
        plt.figure(figsize=(10, 10))
        interval = file.interval
        interval_minute = read_dataset.get_interval_minute(interval)
        test_size = y_hat_test.shape[0]
        plt.plot([i for i in xrange(test_size)], y_hat_test)
        plt.plot([i for i in xrange(test_size)], y_test)
        plt.legend(['Prediction', 'Real'])
        plt.suptitle('Cross Validation + Bagging Classifier + Ridge Regression')
        plt.savefig('Cross Validation + Bagging Classifier + Ridge Regression.png',
                    bbox_inches='tight')

        # Report the RMSE on the held-out split.
        loss = np.sqrt(mean_squared_error(y_test, y_hat_test))
        print 'Cross Validation + Bagging Classifier + Ridge Regression loss =', loss

    ''' ======================================================================= '''
    # Predict on the test set and write the submission.
    submission_file_name = 'Submission format'
    submission_file = None
    test_file_name = 'Test set Microclimate (2 hour intervals)'
    test_file = None
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name == submission_file_name:
            submission_file = file
            break
    submission_path = dataset_path + submission_file.file_path
    df_submission = pd.read_csv(submission_path, index_col=0, parse_dates=[0])
    for k in xrange(file_amount):
        file = all_file_param[k]
        if file.data_name == test_file_name:
            test_file = file
            break
    test_path = dataset_path + test_file.file_path
    df_test = pd.read_csv(test_path, index_col=0, parse_dates=[0])

    # Align the submission timestamps with the test-set features, fill
    # missing values with the column mean, and normalize in place.
    X_combined = write_submission.combine_table(df_submission, df_test)
    imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
    fixed_X = X_combined.values[:, 0:]
    imp.fit(fixed_X)
    X_combined.values[:, 0:] = imp.transform(fixed_X)
    preprocessing.normalize(X_combined.values, copy=False)

    y_submission = write_submission.write_submission_binary_classifier_and_regression(
        X_combined, clf, clf_regression, df_submission,
        'Cross Validation + Bagging Classifier + Ridge Regression Submission')
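# write_submission.write_submission_binary_classifier_and_regression is
# referenced above but not included. A hypothetical sketch that mirrors
# the classifier-gates-regression merge used on the test split: classify
# each row, run the ridge model only where the classifier predicts
# water, and zero the rest. The single-column CSV layout is an
# assumption based on the 'Submission format' file being read with its
# timestamp column as the index.
def write_submission_binary_classifier_and_regression(
        X_combined, clf, clf_regression, df_submission, name):
    X = X_combined.values
    y_binary = clf.predict(X)
    y = np.zeros(len(y_binary))
    nonzero = y_binary != 0
    if nonzero.any():
        y[nonzero] = clf_regression.predict(X[nonzero])
    # Overwrite the placeholder target column and save under the run name.
    df_submission.iloc[:, 0] = y
    df_submission.to_csv(name + '.csv')
    return y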