def _impute_and_normalize(imputer, data):
    """Impute and row-normalize every column of ``data`` except the first.

    ``imputer`` must already be fitted; it is only used to transform here.
    ``data`` is modified in place and nothing is returned.
    """
    features = imputer.transform(data[:, 1:])
    # Assign the returned array instead of relying on ``copy=False`` mutating
    # its argument: scikit-learn only normalizes in place for suitable inputs.
    data[:, 1:] = preprocessing.normalize(features)


def split_and_build_class(X, y, test_size=0.3, random_state=None):
    """Split the samples, clean the feature columns and wrap both splits.

    The first column of ``X`` is left untouched (presumably a timestamp
    column -- confirm against ``read_dataset.microData.get_data``).  In the
    remaining columns, missing values are replaced by the per-column mean and
    each row is then passed through ``preprocessing.normalize``.

    The imputer is fitted on the training split only and reused for the test
    split, so no statistics computed from the test data enter the
    preprocessing.

    NOTE(review): ``preprocessing.Imputer`` only exists in older scikit-learn
    releases; newer ones provide ``sklearn.impute.SimpleImputer`` instead.

    Args:
        X: 2-D array of samples, indexable as ``X[:, 1:]``.
        y: Targets aligned with the rows of ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded ``0.3``.
        random_state: Forwarded to ``train_test_split``; pass a value to make
            the split reproducible.  ``None`` keeps the former behaviour.

    Returns:
        The list ``[X_train, X_test, y_train, y_test, train_data, test_data]``
        where ``y_train`` / ``y_test`` are the values returned by
        ``set_output`` and ``train_data`` / ``test_data`` are
        ``read_dataset.microData`` objects loaded with the matching split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    print(X_train.shape)
    print(X_test.shape)

    # Learn the column means from the training features only.
    imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_train[:, 1:])
    _impute_and_normalize(imp, X_train)
    _impute_and_normalize(imp, X_test)

    train_data = read_dataset.microData()
    train_data.get_data(X_train)
    y_train = train_data.set_output(y_train)

    test_data = read_dataset.microData()
    test_data.get_data(X_test)
    y_test = test_data.set_output(y_test)

    return [X_train, X_test, y_train, y_test, train_data, test_data]
def plot_one_feature(plot_files, all_file_param, dataset_path, feature):
    """Plot one feature over time for the selected datasets and save a PNG.

    For every entry of ``all_file_param`` whose ``data_name`` is listed in
    ``plot_files`` the CSV at ``dataset_path + file_path`` is loaded, and a
    line segment is drawn between each pair of consecutive samples whose time
    difference has zero days and whose whole minutes do not exceed the
    expected interval.  Larger (or negative) differences are left as gaps.

    The figure is written to ``./micro_features_plot/<feature>.png`` (the
    directory is not created here) and closed afterwards so that repeated
    calls do not accumulate open pyplot figures.

    NOTE(review): a function with this same name is defined again later in
    this file; the later definition is the one that stays bound.

    Args:
        plot_files: Collection of ``data_name`` values to plot; also used as
            the legend labels and in the figure title.
        all_file_param: Sequence of objects exposing ``data_name``,
            ``file_path`` and ``interval``.
        dataset_path: String prefix concatenated with each ``file_path``.
        feature: Feature name handed to ``microData.read_feature``; also used
            in the title and the output file name.

    Raises:
        ZeroDivisionError: If ``all_file_param`` is empty (the colour is
            computed as ``9 / len(all_file_param)``).
    """
    file_amount = len(all_file_param)

    # Initiate the plot.
    cmap = plt.get_cmap('jet_r')
    fig = plt.figure(figsize=(25, 10))
    try:
        # A single fixed colour is shared by every plotted file.
        color = cmap(float(9) / file_amount)

        # Traverse all the datasets.
        print('==========' + feature + '==========')
        for file_param in all_file_param:
            # Only the datasets named in plot_files are drawn.
            if file_param.data_name not in plot_files:
                continue

            print('Ploting ' + file_param.data_name + '...')
            df = pd.read_csv(dataset_path + file_param.file_path)
            all_data = read_dataset.microData()
            all_data.get_data(df.values)

            # Presumably the expected sampling interval in minutes -- confirm
            # against read_dataset.get_interval_minute.
            interval_minutes = read_dataset.get_interval_minute(
                file_param.interval)
            times = all_data.data_time
            # Fetched once per file instead of once per segment; assumes
            # read_feature has no side effects -- TODO confirm.
            values = all_data.read_feature(feature)

            prev_time = times[0]
            for i in range(1, len(times)):
                current_time = times[i]
                diff = current_time - prev_time
                # Floor division keeps the whole-minute comparison identical
                # on Python 2 and Python 3.
                if not diff.days and diff.seconds // 60 <= interval_minutes:
                    plt.plot([prev_time, current_time],
                             values[i - 1:i + 1],
                             c=color)
                prev_time = current_time
                if i % 1000 == 0:
                    # Same text the comma-separated print statement produced.
                    print(' '.join(('Plotted ', str(i), ' lines...')))
            print('Plotted done!')

        plt.legend(plot_files)
        plt.suptitle(feature + ' in ' + str(plot_files))
        plt.savefig('./micro_features_plot/' + feature + '.png',
                    bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
def plot_one_feature(plot_files, all_file_param, dataset_path, feature):
    """Draw a single feature against time for chosen datasets and save it.

    Each element of ``all_file_param`` whose ``data_name`` appears in
    ``plot_files`` is read from ``dataset_path + file_path``.  Consecutive
    samples are joined by a line segment only when their time difference has
    zero days and its whole-minute part is at most the expected interval, so
    missing stretches show up as breaks in the curve.

    The result is saved as ``./micro_features_plot/<feature>.png`` (the
    directory must already exist) and the figure is closed before returning
    so repeated calls do not pile up open pyplot figures.

    NOTE(review): this redefines a function of the same name that appears
    earlier in this file; consider removing one of the two definitions.

    Args:
        plot_files: Collection of ``data_name`` values to include; doubles as
            the legend labels and is embedded in the title.
        all_file_param: Sequence of objects with ``data_name``, ``file_path``
            and ``interval`` attributes.
        dataset_path: String prepended to every ``file_path``.
        feature: Name passed to ``microData.read_feature``; also used for the
            title and the output file name.

    Raises:
        ZeroDivisionError: If ``all_file_param`` is empty, because the colour
            is derived from ``9 / len(all_file_param)``.
    """
    file_amount = len(all_file_param)

    # Initiate the plot.
    cmap = plt.get_cmap('jet_r')
    fig = plt.figure(figsize=(25, 10))
    try:
        # One fixed colour is used for all files.
        color = cmap(float(9) / file_amount)

        # Traverse all the datasets.
        print('==========' + feature + '==========')
        for file_param in all_file_param:
            # Skip datasets that were not requested.
            if file_param.data_name not in plot_files:
                continue

            print('Ploting ' + file_param.data_name + '...')
            df = pd.read_csv(dataset_path + file_param.file_path)
            all_data = read_dataset.microData()
            all_data.get_data(df.values)

            # Presumably the expected sampling interval in minutes -- confirm
            # against read_dataset.get_interval_minute.
            interval_minutes = read_dataset.get_interval_minute(
                file_param.interval)
            times = all_data.data_time
            # Looked up once per file rather than for every segment; assumes
            # read_feature has no side effects -- TODO confirm.
            values = all_data.read_feature(feature)

            prev_time = times[0]
            for i in range(1, len(times)):
                current_time = times[i]
                diff = current_time - prev_time
                # ``//`` gives the same whole-minute result on Python 2 and 3.
                if not diff.days and diff.seconds // 60 <= interval_minutes:
                    plt.plot([prev_time, current_time],
                             values[i - 1:i + 1],
                             c=color)
                prev_time = current_time
                if i % 1000 == 0:
                    # Same text the comma-separated print statement produced.
                    print(' '.join(('Plotted ', str(i), ' lines...')))
            print('Plotted done!')

        plt.legend(plot_files)
        plt.suptitle(feature + ' in ' + str(plot_files))
        plt.savefig('./micro_features_plot/' + feature + '.png',
                    bbox_inches='tight')
    finally:
        # Figures created through pyplot stay in memory until closed.
        plt.close(fig)