def test_min_max_scaler_iris():
    X = iris.data
    scaler = MinMaxScaler()

    # default params
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 0)
    assert_array_almost_equal(X_trans.max(axis=0), 1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # not default params: min=1, max=2
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 1)
    assert_array_almost_equal(X_trans.max(axis=0), 2)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # min=-.5, max=.6
    scaler = MinMaxScaler(feature_range=(-.5, .6))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), -.5)
    assert_array_almost_equal(X_trans.max(axis=0), .6)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # raises on invalid range
    scaler = MinMaxScaler(feature_range=(2, 1))
    assert_raises(ValueError, scaler.fit, X)

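# A minimal, self-contained sketch (not part of the test suite above) showing the
# per-feature formula MinMaxScaler applies, which is what the assertions verify:
# X_std = (X - X.min) / (X.max - X.min), then X_scaled = X_std * (b - a) + a
# for feature_range=(a, b). The toy data here is made up for illustration.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

X_demo = np.array([[1., 10.], [2., 20.], [4., 40.]])
a, b = -.5, .6
demo_scaler = MinMaxScaler(feature_range=(a, b))
X_demo_trans = demo_scaler.fit_transform(X_demo)

# reproduce the transform by hand, column by column
X_std = (X_demo - X_demo.min(axis=0)) / (X_demo.max(axis=0) - X_demo.min(axis=0))
X_manual = X_std * (b - a) + a
assert np.allclose(X_demo_trans, X_manual)
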
def test_min_max_scaler_zero_variance_features():
    # Check min max scaler on toy data with zero variance features
    X = [[0., 1., +0.5],
         [0., 1., -0.1],
         [0., 1., +1.1]]

    X_new = [[+0., 2., 0.5],
             [-1., 1., 0.0],
             [+0., 1., 1.5]]

    # default params
    scaler = MinMaxScaler()
    X_trans = scaler.fit_transform(X)
    X_expected_0_1 = [[0., 0., 0.5],
                      [0., 0., 0.0],
                      [0., 0., 1.0]]
    assert_array_almost_equal(X_trans, X_expected_0_1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    X_trans_new = scaler.transform(X_new)
    X_expected_0_1_new = [[+0., 1., 0.500],
                          [-1., 0., 0.083],
                          [+0., 0., 1.333]]
    assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)

    # not default params
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    X_expected_1_2 = [[1., 1., 1.5],
                      [1., 1., 1.0],
                      [1., 1., 2.0]]
    assert_array_almost_equal(X_trans, X_expected_1_2)

    # function interface
    X_trans = minmax_scale(X)
    assert_array_almost_equal(X_trans, X_expected_0_1)
    X_trans = minmax_scale(X, feature_range=(1, 2))
    assert_array_almost_equal(X_trans, X_expected_1_2)

def spambase_transform(input_path, features_path, labels_path, metadata_path):
    metadata = create_metadata(VARIABLES,
                               create_one_type_dictionary("numerical", VARIABLES),
                               {},
                               sum(NUM_SAMPLES),
                               CLASSES)

    input_file = open(input_path, "r")

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    i = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        assert len(values) - 1 == len(VARIABLES), str((len(values) - 1, len(VARIABLES)))

        for j, value in enumerate(values[:-1]):
            value = float(value)
            features[i, j] = value

        labels[i] = int(values[-1])

        i += 1
        line = input_file.readline()

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"]

    num_positive_samples = int(labels.sum())
    num_negative_samples = labels.shape[0] - num_positive_samples

    assert num_negative_samples == NUM_SAMPLES[0]
    assert num_positive_samples == NUM_SAMPLES[1]

    print("Negative samples: ", num_negative_samples)
    print("Positive samples: ", num_positive_samples)
    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)

def process_data():
    file_path = '/Users/fpena/Stuff/House Search/Dublin/viewings-ucd.csv'
    data_frame = pandas.read_csv(file_path)

    print(data_frame.columns.values.tolist())
    print(data_frame.head())
    print(data_frame.describe())
    print(data_frame['Price'])

    price_scaler = MinMaxScaler()
    data_frame['Price Score'] = 1 - price_scaler.fit_transform(
        data_frame[['Price']])
    data_frame['Cycle Time Score'] = 1 - price_scaler.fit_transform(
        data_frame[['Cycle Time']])
    data_frame['Score'] = 0.5 * (data_frame['Price Score'] +
                                 data_frame['Cycle Time Score'])
    data_frame['Rank'] = data_frame['Score'].rank(
        ascending=True) / (len(data_frame))

    cycle_hour_cost = 30
    working_days_per_month = 22
    data_frame['Money Score'] = \
        data_frame['Price'] + \
        data_frame['Cycle Time'] / 60 * cycle_hour_cost * working_days_per_month
    data_frame.rename(columns={'Cycle Time': 'Cycle'}, inplace=True)

    # print(data_frame['Price Score'])
    # print(data_frame[['Score', 'Rank']])
    # with pandas.option_context('display.max_rows', 500, 'display.max_columns', 10):
    #     print(data_frame[['Address', 'Price', 'Cycle', 'Rank', 'Score']].sort_values('Rank', ascending=False))
    #     print(data_frame[['Address', 'Price', 'Cycle', 'Rank', 'Score', 'Money Score']].to_string())
    print(data_frame[[
        'Address', 'Price', 'Cycle', 'Rank', 'Score', 'Money Score'
    ]].sort_values('Rank', ascending=False).to_string())

    # seaborn.(x='Price', y='Cycle Time', data_frame=data_frame)
    data_frame.plot.scatter(x='Price', y='Cycle')
    pyplot.savefig('/tmp/daft_scatter.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.plot.scatter(x='Price Score', y='Cycle Time Score')
    pyplot.savefig('/tmp/daft_scatter_norm.pdf')
    pyplot.cla()
    pyplot.clf()

    seaborn.stripplot(x='Accommodation Type', y='Price', data=data_frame, jitter=True)
    pyplot.savefig('/tmp/daft_price.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.plot.scatter(x='Housemates', y='Price')
    pyplot.savefig('/tmp/daft_scatter_price_housemates.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.to_csv('/tmp/daft-houses-processed.csv')

def letter_recognition_transform(input_path, features_path, labels_path, metadata_path):
    metadata = create_metadata(VARIABLES,
                               create_one_type_dictionary("numerical", VARIABLES),
                               {},
                               sum(NUM_SAMPLES),
                               CLASSES)

    input_file = open(input_path, "r")

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    i = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        assert len(values) - 1 == len(VARIABLES), str((len(values) - 1, len(VARIABLES)))

        for j, value in enumerate(values[1:]):
            value = float(value)
            features[i, j] = value

        labels[i] = CLASS_TO_INDEX[values[0]]

        i += 1
        line = input_file.readline()

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"]

    for class_index in range(len(NUM_SAMPLES)):
        num_samples_class = (labels == class_index).sum()
        assert num_samples_class == NUM_SAMPLES[class_index]

    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)

def default_credit_card_transform(input_path, features_path, labels_path, metadata_path):
    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)

    variables = set(reader.fieldnames)
    variables.remove("ID")
    variables.remove("default payment next month")

    metadata = create_metadata(variables, TYPES, VALUES, NUM_SAMPLES, CLASSES)

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    for i, row in enumerate(reader):
        # the categorical variables are already one hot encoded
        for j, variable in enumerate(metadata["variables"]):
            value = row[variable]
            if TYPES[variable] == "numerical":
                value = float(value)
                features[i, metadata["value_to_index"][variable]] = value
            elif TYPES[variable] == "categorical":
                value = value.replace(".0", "")
                assert value in VALUES[variable], \
                    "'{}' is not a valid value for '{}'".format(value, variable)
                features[i, metadata["value_to_index"][variable][value]] = 1.0

        # the class needs to be transformed
        labels[i] = int(row["default payment next month"].replace(".0", ""))

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"] - 1

    num_positive_samples = int(labels.sum())
    num_negative_samples = labels.shape[0] - num_positive_samples

    print("Negative samples: ", num_negative_samples)
    print("Positive samples: ", num_positive_samples)
    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)

def load_data(n_samples, label_scaling: bool = False):
    """Take in Brian's data and spit out some numpy arrays for the PAL"""
    df_full_factorial_feat = pd.read_csv(
        os.path.join(DATADIR, 'new_features_full_random.csv'))[FEATURES].values
    a2 = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values
    gibbs = pd.read_csv(
        os.path.join(DATADIR, 'b1-b21_random_deltaG.csv'))['deltaGmin'].values * (-1)
    gibbs_max = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['deltaGmax'].values
    rg = pd.read_csv(os.path.join(DATADIR, 'rg_results.csv'))['Rg'].values

    y = np.hstack(
        [rg.reshape(-1, 1), gibbs.reshape(-1, 1), gibbs_max.reshape(-1, 1)])

    assert len(df_full_factorial_feat) == len(a2) == len(gibbs) == len(y)

    feat_scaler = StandardScaler()
    X = feat_scaler.fit_transform(df_full_factorial_feat)

    if label_scaling:
        label_scaler = MinMaxScaler()
        y = label_scaler.fit_transform(y)

    greedy_indices = get_maxmin_samples(X, n_samples)

    nan_indices = np.unique(np.random.randint(0, len(y) - 1, int(len(y) / 3)))
    y[nan_indices, 2] = np.nan

    return X, y, greedy_indices

def parkinsons_replicated_data(self, park_dat):
    df_parkinson = pd.read_csv(park_dat, sep=',')
    ylabel = df_parkinson['Status']
    xfeatures = df_parkinson.drop(['Status', 'ID'], axis=1)
    xfeats = df_parkinson.drop(['Status', 'ID'], axis=1).values

    # global min-max normalization over the whole feature matrix
    x = (xfeats - np.min(xfeats)) / (np.max(xfeats) - np.min(xfeats))
    y = df_parkinson['Status'].values

    # per-column min-max scaling, used for the correlation matrix plot
    xfeatsp = pd.DataFrame(xfeatures)
    minmax_scaling = MinMaxScaler()
    x_scaledp = minmax_scaling.fit_transform(xfeatsp)
    x_scaledp = pd.DataFrame(x_scaledp)

    f1 = plt.figure(figsize=(19, 16))
    plt.matshow(x_scaledp.corr(), fignum=f1.number)
    plt.xticks(range(x_scaledp.shape[1]), x_scaledp.columns, fontsize=10, rotation=45)
    plt.yticks(range(x_scaledp.shape[1]), x_scaledp.columns, fontsize=10)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=12)
    plt.show()

    # per-column min-max normalization of the original dataframe
    for eachx in xfeatures:
        xfeatures[eachx] = (xfeatures[eachx] - xfeatures[eachx].min()) / \
            (xfeatures[eachx].max() - xfeatures[eachx].min())

    ylabel = ylabel.values
    # ydata = ylabel[:, None]
    xdata = x_scaledp.to_numpy()
    targets = np.array(ylabel).reshape(-1)
    y = np.eye(2)[targets]

    xtrain, xtest, y_train, y_test = train_test_split(x, y, test_size=0.30)  # , shuffle=False)
    print(y_test)
    # y_train = ytrain[:, None]
    # y_test = ytest[:, None]

    return xtrain, xtest, y_train, y_test

def pearson(A, B, scale=True):
    correlation = 0
    if scale:
        scaler = MinMaxScaler()
        A = scaler.fit_transform(A)
        B = scaler.fit_transform(B)
    for i in range(A.shape[1]):
        correlation = correlation + pearsonr(A[:, i], B[:, i])[0]
    return correlation / A.shape[1]

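# Hedged usage sketch for the pearson() helper above: two matrices of matching
# shape, where each column of A is compared against the corresponding column of B
# and the per-column correlations are averaged. Pearson correlation is invariant
# to the min-max rescaling, so scale=True and scale=False give the same result;
# the synthetic data below is made up for illustration.
import numpy as np

rng = np.random.RandomState(0)
A_demo = rng.rand(50, 3)
B_demo = A_demo + 0.1 * rng.rand(50, 3)   # noisy copy, so correlations are high

print(pearson(A_demo, B_demo))            # average column-wise correlation, close to 1
print(pearson(A_demo, B_demo, scale=False))
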
def calculate_scores(data_frame):
    data_frame = filter_data(data_frame)
    min_max_scaler = MinMaxScaler()

    data_frame['cycle_time'] = data_frame['distance_to_ucd'] * 6
    data_frame['price_score'] = 1 - min_max_scaler.fit_transform(
        data_frame[['price']])
    data_frame['cycle_time_score'] = 1 - min_max_scaler.fit_transform(
        data_frame[['cycle_time']])
    data_frame['money'] = \
        data_frame['price'] + data_frame['cycle_time'] * 22 * TIME_PRICE_PER_HOUR / 60
    data_frame['money_score'] = 1 - min_max_scaler.fit_transform(
        data_frame[['money']])
    data_frame['score'] = \
        data_frame['price_score'] + data_frame['cycle_time_score']
    data_frame['score'] = min_max_scaler.fit_transform(data_frame[['score']])
    data_frame['money_rank'] = data_frame['money'].rank(
        ascending=False) / (len(data_frame))

    pandas.options.display.max_colwidth = 200

    return data_frame

def classify(X_train, y_train, X_test, y_test):
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)

    liberatore_NB = GaussianNB()
    liberatore_NB.fit(X_train, y_train)
    del X_train

    X_test = scaler.transform(X_test)
    predictions = liberatore_NB.predict(X_test)

    return y_test, predictions

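# Hedged usage sketch for classify() above: the scaler is fit on the training
# split only and reused on the test split, which is exactly what the helper does
# internally. The synthetic data and the train/test split below are illustrative
# assumptions, not part of the original pipeline.
import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(42)
X_demo = rng.rand(200, 5)
y_demo = (X_demo[:, 0] > 0.5).astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.3, random_state=42)
y_true, y_pred = classify(X_tr, y_tr, X_te, y_te)
print((y_true == y_pred).mean())  # accuracy of the Gaussian Naive Bayes baseline
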
def classifier_dyer2012(X_train, y_train, X_test, y_test, time_train=None, time_test=None):
    obj = Dyer2012VNGPlusPlusClassifier()
    X_train, fields = dyer2012_tracestoInstances(obj, X_train, time_train)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)

    models1 = {
        'Bernoulli': BernoulliNB(),
        'Gaussian': GaussianNB(),
        'Multinomial': MultinomialNB(),
    }

    params1 = {
        'Bernoulli': {},
        'Gaussian': {},
        'Multinomial': {},
        # 'SVC': [
        #     {'kernel': ['linear'], 'C': [1, 10]},
        #     {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        # ]
    }

    dyer_NB = MultinomialNB()
    dyer_NB.fit(X_train, y_train)
    del X_train

    # test
    X_test, fields = dyer2012_tracestoInstances(obj, X_test, time_test, fields)
    X_test = scaler.transform(X_test)
    predictions = dyer_NB.predict(X_test)
    del X_test

    labels = []
    for l in y_train:
        if l not in labels:
            labels.append(l)

    return y_test, predictions

def xtraintestdata(self, datarray, yarray, dfiletowrite):
    x_train, x_test, y_train, y_test = train_test_split(datarray, yarray,
                                                        test_size=0.2,
                                                        random_state=1)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                      test_size=0.2,
                                                      random_state=1)

    min_max_scaler = MinMaxScaler()
    # feed in a numpy array
    x_train_norm = min_max_scaler.fit_transform(x_train)
    _ = np.c_[x_train_norm, y_train]

    dirme = dfiletowrite
    sio.savemat(dirme, mdict={'UCIDat': yarray})

    xy_valid = np.c_[x_val, y_val]
    xy_train = np.c_[x_train, y_train]
    xy_test = np.c_[x_test, y_test]

    return xy_train, xy_test, xy_valid

def load_data(n_samples, label_scaling: bool = False, method: str = 'maxmin'):
    """Take in Brian's data and spit out some numpy arrays for the PAL"""
    df_full_factorial_feat = pd.read_csv(
        os.path.join(DATADIR, 'new_features_full_random.csv'))[FEATURES].values
    a2 = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values
    deltaGMax = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values  # pylint:disable=unused-variable
    gibbs = pd.read_csv(
        os.path.join(DATADIR, 'b1-b21_random_deltaG.csv'))['deltaGmin'].values * (-1)
    gibbs_max = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['deltaGmax'].values
    force_max = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_fit2.csv'))['F_repel_max'].values  # pylint:disable=unused-variable
    rg = pd.read_csv(os.path.join(DATADIR, 'rg_results.csv'))['Rg'].values

    y = np.hstack(
        [rg.reshape(-1, 1), gibbs.reshape(-1, 1), gibbs_max.reshape(-1, 1)])

    assert len(df_full_factorial_feat) == len(a2) == len(gibbs) == len(y)

    feat_scaler = StandardScaler()
    X = feat_scaler.fit_transform(df_full_factorial_feat)

    if label_scaling:
        label_scaler = MinMaxScaler()
        y = label_scaler.fit_transform(y)

    if method == 'maxmin':
        greedy_indices = get_maxmin_samples(X, n_samples)
    elif method == 'kmeans':
        greedy_indices = get_kmeans_samples(X, n_samples)
    else:
        raise ValueError("method must be 'maxmin' or 'kmeans'")

    return X, y, greedy_indices

def prepare_df_for_violinplot(df, feature_cols, class_col, class_indices=None, minmaxscale=True):
    """
    Min-max-scale the data and then melt the dataframe into the long format
    """
    if class_indices:
        df = df.loc[list(class_indices)]
    df = df[feature_cols + [class_col]]

    if minmaxscale:
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        df[feature_cols] = scaler.fit_transform(df[feature_cols])

    prepared_df = pd.melt(df, value_vars=feature_cols, id_vars=class_col)
    return prepared_df

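# Hedged follow-up for prepare_df_for_violinplot(): the long-format frame produced
# by pd.melt has the columns 'variable', 'value' and the class column, which is the
# layout seaborn.violinplot expects. The toy dataframe and the column names below
# are assumptions made for this example only.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

toy = pd.DataFrame({
    'feat_a': [0.1, 0.5, 0.9, 0.3],
    'feat_b': [10., 20., 30., 40.],
    'label': ['x', 'x', 'y', 'y'],
})
long_df = prepare_df_for_violinplot(toy, ['feat_a', 'feat_b'], 'label')
sns.violinplot(x='variable', y='value', hue='label', data=long_df, split=True)
plt.show()
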
def classifier_panchenko2016(X_train, y_train, X_test, y_test, separateClassifier=False):
    train_or_test_labels = ["train" for i in y_train] + ["test" for i in y_test]

    y_train, X_train, y_test, X_test = outlier_removal(train_or_test_labels,
                                                       X_train + X_test,
                                                       y_train + y_test)

    y_train, X_train = features_extraction(y_train, X_train,
                                           separateClassifier=separateClassifier,
                                           featuresCount=100)
    y_test, X_test = features_extraction(y_test, X_test,
                                         separateClassifier=separateClassifier,
                                         featuresCount=100)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier = SVC(kernel="rbf",
                     C=2e11,
                     gamma=2e-1,
                     max_iter=5000,
                     class_weight="balanced",
                     verbose=1)

    print("fitting")
    classifier.fit(X_train, y_train)
    print("testing")
    y_predictions = classifier.predict(X_test)  # , y_test)

    return y_test, y_predictions

print(min_exercised_stock_options, max_exercised_stock_options)

features_list = [poi, feature_2, feature_1]
data = featureFormat(data_dict, features_list, remove_any_zeroes=True)
poi, finance_features = targetFeatureSplit(data)
finance_features = numpy.reshape(numpy.array(finance_features),
                                 (len(finance_features), 2))
## finance_features = sorted(finance_features, key=lambda x: x[0], reverse=True)
print(finance_features)

limit = len(finance_features)
scaler = MinMaxScaler()
### salaries = numpy.array([finance_features[0], [1000000.], finance_features[limit - 1]])
rescaled_weight = scaler.fit_transform(finance_features)
print("The rescaled weight", rescaled_weight)

data_dict.pop("TOTAL", 0)

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = "total_payments"
poi = "poi"
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list, remove_any_zeroes=True)
poi, finance_features = targetFeatureSplit(data)
print("Last ", finance_features)

from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler

iris = datasets.load_iris()

transformer = MinMaxScaler()
newX = transformer.fit_transform(iris.data)

print(iris.data)
print('==============')
print(newX)

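# Optional continuation of the iris example above: the fitted scaler exposes the
# per-feature statistics it learned, and inverse_transform maps the scaled data
# back to the original units.
print(transformer.data_min_)                 # per-feature minima seen during fit
print(transformer.data_max_)                 # per-feature maxima seen during fit
print(transformer.inverse_transform(newX))   # recovers iris.data (up to float error)
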
import numpy
from sklearn.preprocessing import MinMaxScaler

# Each element of the numpy array is a different training point
# Each element within the training point is a feature
# This example has one feature - the weights feature
# Three different training points
# old_weights = numpy.array([[115], [140], [175]])
weights = numpy.array([[115.], [140.], [175.]])
# print("type(weights) - {}\n".format(type(weights)))
# type(weights) - <class 'numpy.ndarray'>

scaler = MinMaxScaler()
# print("type(scaler) - {}\n".format(type(scaler)))
# type(scaler) - <class 'sklearn.preprocessing.data.MinMaxScaler'>

rescaled_weight = scaler.fit_transform(weights)
# 1 of 2 steps: fit - find x_min, x_max
# 2 of 2 steps: transform - applies the formula to the elements
# print("rescaled_weight - ")
# print(rescaled_weight)
# rescaled_weight -
# [[ 0.        ]
#  [ 0.41666667]
#  [ 1.        ]]

print("type(rescaled_weight) - {}\n".format(type(rescaled_weight)))
# type(rescaled_weight) - <class 'numpy.ndarray'>

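# Sanity check of the value above by hand: with x_min = 115 and x_max = 175, the
# middle point 140 rescales to (140 - 115) / (175 - 115) = 25 / 60 = 0.41666...,
# matching the second entry of rescaled_weight.
manual = (140. - 115.) / (175. - 115.)
print(manual)                  # 0.4166666...
print(rescaled_weight[1, 0])   # same value from MinMaxScaler
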
print('................................xxxx{} - {}'.format(valueName, value))

if valueName == 'salary' and value != 'NaN' and value > 190000 and value < 210000:
    print('................................xxxx{} - {}'.format(valueName, value))

print()
print("smallest exercised_stock_options - {}".format(smallestESO))
print("largest exercised_stock_options - {}".format(largestESO))
print("smallestSalary - {}".format(smallestSalary))
print("largestSalary - {}\n".format(largestSalary))

scaler = MinMaxScaler()

original_salary = numpy.array([[smallestSalary], [200000.], [largestSalary]])
rescaled_salary = scaler.fit_transform(original_salary)
print('rescaled_salary ->')
print(rescaled_salary)

original_exercised_stock_options = numpy.array([[smallestESO], [1000000.], [largestESO]])
rescaled_exercised_stock_options = scaler.fit_transform(original_exercised_stock_options)
print('rescaled_exercised_stock_options ->')
print(rescaled_exercised_stock_options)

# class video - Introduction To Machine Learning - Clustering - Quiz: Clustering Features
# What features will your clustering algorithm use?
# answer - 1.) salary, 2.) exercised_stock_options

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)

def split_train_validation_test(multi_time_series_df,
                                valid_start_time,
                                test_start_time,
                                features,
                                time_step_lag=1,
                                horizon=1,
                                target='target',
                                time_format='%Y-%m-%d %H:%M:%S',
                                freq='H'):
    if not isinstance(features, list) or len(features) < 1:
        raise Exception(
            "Bad input for features. It must be an array of dataframe columns used")

    train = multi_time_series_df.copy()[
        multi_time_series_df.index < valid_start_time]
    train = train[features]

    X_scaler = MinMaxScaler()
    if 'load' in features:
        y_scaler = MinMaxScaler()
        y_scaler.fit(train[['load']])
    else:
        y_scaler = MinMaxScaler()
        tg = train[target]
        y_scaler.fit(tg.values.reshape(-1, 1))

    train[features] = X_scaler.fit_transform(train)

    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    train_inputs = TimeSeriesTensor(train, target=target, H=horizon, freq=freq,
                                    tensor_structure=tensor_structure)
    print(train_inputs.dataframe.head())

    look_back_dt = dt.datetime.strptime(valid_start_time, time_format) - \
        dt.timedelta(hours=time_step_lag - 1)
    valid = multi_time_series_df.copy()[
        (multi_time_series_df.index >= look_back_dt) &
        (multi_time_series_df.index < test_start_time)]
    valid = valid[features]
    valid[features] = X_scaler.transform(valid)

    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    valid_inputs = TimeSeriesTensor(valid, target=target, H=horizon, freq=freq,
                                    tensor_structure=tensor_structure)
    print(valid_inputs.dataframe.head())

    # test set
    # look_back_dt = dt.datetime.strptime(test_start_time, '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=time_step_lag - 1)
    test = multi_time_series_df.copy()[test_start_time:]
    test = test[features]
    test[features] = X_scaler.transform(test)

    test_inputs = TimeSeriesTensor(test, target=target, H=horizon, freq=freq,
                                   tensor_structure=tensor_structure)

    print("time lag:", time_step_lag, "original_feature:", len(features))

    return train_inputs, valid_inputs, test_inputs, y_scaler

    if dropnan:
        agg.dropna(inplace=True)
    return agg


# load dataset
dataset = read_csv('data/pollution.csv', header=0, index_col=0)
values = dataset.values
# integer encode direction
encoder = LabelEncoder()
values[:, 4] = encoder.fit_transform(values[:, 4])
# ensure all data is float
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
# specify the number of lag hours
n_hours = 1
n_features = 8  ### NOT USED YET
# frame as supervised learning
reframed = series_to_supervised(scaled, n_hours, 1)
# drop columns we don't want to predict
reframed.drop(reframed.columns[[9, 10, 11, 12, 13, 14, 15]], axis=1, inplace=True)
print(reframed.head())

# split into train and test sets
values = reframed.values
n_train_hours = 365 * 24

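# A hedged sketch of the usual next step for this preparation (not part of the
# original excerpt): split the reframed values into train/test sets using
# n_train_hours, separate inputs from the target, and reshape the inputs to the
# 3D [samples, timesteps, features] layout a recurrent model expects.
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]
# last column is the variable to predict
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
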
def online_news_popularity_transform(input_path, features_path, labels_path, metadata_path):
    variables = []
    types = {}
    values = {}
    for original_variable, original_type in ORIGINAL_TYPES.items():
        if "_is_" in original_variable:
            index = original_variable.index("_is_")
            variable = original_variable[:index]
            value = original_variable[index + 4:]
            if variable not in types:
                assert variable not in values
                types[variable] = "categorical"
                if CAN_BE_EMPTY[variable]:
                    values[variable] = ["none"]
                else:
                    values[variable] = []
                variables.append(variable)
            values[variable].append(value)
        else:
            variables.append(original_variable)
            types[original_variable] = original_type

    metadata = create_metadata(variables, types, values, NUM_SAMPLES)

    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)
    reader.fieldnames = [variable.strip() for variable in reader.fieldnames]

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.float32)

    # transform
    for i, row in enumerate(reader):
        # the categorical variables are already one hot encoded
        for j, variable in enumerate(metadata["variables"]):
            if types[variable] == "numerical":
                value = float(row[variable])
                features[i, metadata["value_to_index"][variable]] = value
            elif types[variable] == "categorical":
                value = None
                for possible_value in values[variable]:
                    if possible_value == "none":
                        continue
                    real_variable = "{}_is_{}".format(variable, possible_value)
                    if read_binary(row[real_variable]) == 1:
                        if value is None:
                            value = possible_value
                        else:
                            raise Exception("'{}' was already defined".format(variable))
                if value is None:
                    if "none" in values[variable]:
                        value = "none"
                    else:
                        for possible_value in values[variable]:
                            if possible_value == "none":
                                continue
                            real_variable = "{}_is_{}".format(variable, possible_value)
                            print(possible_value, real_variable,
                                  read_binary(row[real_variable]))
                        raise Exception("'{}' has no valid value".format(variable))
                features[i, metadata["value_to_index"][variable][value]] = 1.0
            elif types[variable] == "binary":
                value = read_binary(row[variable])
                assert value in [0, 1], \
                    "'{}' is not a valid value for '{}'".format(value, variable)
                features[i, metadata["value_to_index"][variable][value]] = 1.0
            else:
                raise Exception("Unknown variable type.")

        labels[i] = row["shares"]

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    label_scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    label_scaler.fit_transform(labels.reshape(-1, 1))

    assert i == metadata["num_samples"] - 1

    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()
    metadata["labels_min"] = label_scaler.data_min_.tolist()
    metadata["labels_max"] = label_scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)

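# Hedged illustration of why the *_transform functions above store
# scaler.data_min_ / scaler.data_max_ in the metadata: with feature_range=(0, 1),
# the original values can be recovered later from the saved arrays alone, without
# re-instantiating the fitted scaler. The file names below are hypothetical.
import json
import numpy as np

with open("metadata.json") as f:            # hypothetical path
    meta = json.load(f)
scaled_features = np.load("features.npy")   # hypothetical path

data_min = np.asarray(meta["features_min"], dtype=np.float32)
data_max = np.asarray(meta["features_max"], dtype=np.float32)
# inverse of x_scaled = (x - min) / (max - min) for feature_range=(0, 1)
original_features = scaled_features * (data_max - data_min) + data_min
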