def fit(self, data, args):
    self.model = MinMaxScaler()

    with Timer() as t:
        self.model.fit(data.X_train, data.y_train)

    return t.interval
def test_min_max_scaler_1d():
    # Test scaling of dataset along single axis
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # Constant feature.
    X = np.zeros(5)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_greater_equal(X_scaled.min(), 0.)
    assert_less_equal(X_scaled.max(), 1.)
def load_data(n_samples, label_scaling: bool = False):
    """Take in Brian's data and spit out some numpy arrays for the PAL"""
    df_full_factorial_feat = pd.read_csv(
        os.path.join(DATADIR, 'new_features_full_random.csv'))[FEATURES].values
    a2 = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values
    gibbs = pd.read_csv(os.path.join(
        DATADIR, 'b1-b21_random_deltaG.csv'))['deltaGmin'].values * (-1)
    gibbs_max = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['deltaGmax'].values
    rg = pd.read_csv(os.path.join(DATADIR, 'rg_results.csv'))['Rg'].values

    y = np.hstack(
        [rg.reshape(-1, 1), gibbs.reshape(-1, 1), gibbs_max.reshape(-1, 1)])
    assert len(df_full_factorial_feat) == len(a2) == len(gibbs) == len(y)

    feat_scaler = StandardScaler()
    X = feat_scaler.fit_transform(df_full_factorial_feat)

    if label_scaling:
        label_scaler = MinMaxScaler()
        y = label_scaler.fit_transform(y)

    greedy_indices = get_maxmin_samples(X, n_samples)

    nan_indices = np.unique(np.random.randint(0, len(y) - 1, int(len(y) / 3)))
    y[nan_indices, 2] = np.nan

    return X, y, greedy_indices
def test_min_max_scaler_iris():
    X = iris.data
    scaler = MinMaxScaler()

    # default params
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 0)
    assert_array_almost_equal(X_trans.max(axis=0), 1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # not default params: min=1, max=2
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 1)
    assert_array_almost_equal(X_trans.max(axis=0), 2)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # min=-.5, max=.6
    scaler = MinMaxScaler(feature_range=(-.5, .6))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), -.5)
    assert_array_almost_equal(X_trans.max(axis=0), .6)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # raises on invalid range
    scaler = MinMaxScaler(feature_range=(2, 1))
    assert_raises(ValueError, scaler.fit, X)
def parkinsons_replicated_data(self, park_dat):
    df_parkinson = pd.read_csv(park_dat, sep=',')
    ylabel = df_parkinson['Status']
    xfeatures = df_parkinson.drop(['Status', 'ID'], axis=1)
    xfeats = df_parkinson.drop(['Status', 'ID'], axis=1).values

    # manual min-max scaling over the whole feature matrix
    x = (xfeats - np.min(xfeats)) / (np.max(xfeats) - np.min(xfeats))
    y = df_parkinson['Status'].values

    # column-wise min-max scaling with scikit-learn, used for the correlation plot
    xfeatsp = pd.DataFrame(xfeatures)
    minmax_scaling = MinMaxScaler()
    x_scaledp = minmax_scaling.fit_transform(xfeatsp)
    x_scaledp = pd.DataFrame(x_scaledp)

    f1 = plt.figure(figsize=(19, 16))
    plt.matshow(x_scaledp.corr(), fignum=f1.number)
    plt.xticks(range(x_scaledp.shape[1]),
               x_scaledp.columns,
               fontsize=10,
               rotation=45)
    plt.xticks(range(x_scaledp.shape[1]), x_scaledp.columns, fontsize=10)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=12)
    plt.show()

    for eachx in xfeatures:
        xfeatures[eachx] = (xfeatures[eachx] - xfeatures[eachx].min()
                            ) / xfeatures[eachx].max()

    ylabel = ylabel.values
    # ydata = ylabel[:, None]
    xdata = x_scaledp.to_numpy()
    targets = np.array(ylabel).reshape(-1)
    y = np.eye(2)[targets]  # one-hot encode the binary labels

    xtrain, xtest, y_train, y_test = train_test_split(
        x, y, test_size=0.30)  #, shuffle=False)
    print(y_test)
    #y_train = ytrain[:, None]
    #y_test = ytest[:, None]

    return xtrain, xtest, y_train, y_test
def process_data():
    file_path = '/Users/fpena/Stuff/House Search/Dublin/viewings-ucd.csv'
    data_frame = pandas.read_csv(file_path)
    print(data_frame.columns.values.tolist())
    print(data_frame.head())
    print(data_frame.describe())
    print(data_frame['Price'])

    # lower price / shorter cycle time => higher score
    price_scaler = MinMaxScaler()
    data_frame['Price Score'] = 1 - price_scaler.fit_transform(
        data_frame[['Price']])
    data_frame['Cycle Time Score'] = 1 - price_scaler.fit_transform(
        data_frame[['Cycle Time']])
    data_frame['Score'] = 0.5 * (data_frame['Price Score'] +
                                 data_frame['Cycle Time Score'])
    data_frame['Rank'] = data_frame['Score'].rank(
        ascending=True) / (len(data_frame))

    cycle_hour_cost = 30
    working_days_per_month = 22
    data_frame['Money Score'] =\
        data_frame['Price'] + data_frame['Cycle Time'] / 60 * cycle_hour_cost * working_days_per_month
    data_frame.rename(columns={'Cycle Time': 'Cycle'}, inplace=True)

    # print(data_frame['Price Score'])
    # print(data_frame[['Score', 'Rank']])
    # with pandas.option_context('display.max_rows', 500, 'display.max_columns', 10):
    #     print(data_frame[['Address', 'Price', 'Cycle', 'Rank', 'Score']].sort_values('Rank', ascending=False))
    # print(data_frame[['Address', 'Price', 'Cycle', 'Rank', 'Score', 'Money Score']].to_string())
    print(data_frame[[
        'Address', 'Price', 'Cycle', 'Rank', 'Score', 'Money Score'
    ]].sort_values('Rank', ascending=False).to_string())

    # seaborn.(x='Price', y='Cycle Time', data_frame=data_frame)
    data_frame.plot.scatter(x='Price', y='Cycle')
    pyplot.savefig('/tmp/daft_scatter.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.plot.scatter(x='Price Score', y='Cycle Time Score')
    pyplot.savefig('/tmp/daft_scatter_norm.pdf')
    pyplot.cla()
    pyplot.clf()

    seaborn.stripplot(x='Accommodation Type',
                      y='Price',
                      data=data_frame,
                      jitter=True)
    pyplot.savefig('/tmp/daft_price.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.plot.scatter(x='Housemates', y='Price')
    pyplot.savefig('/tmp/daft_scatter_price_housemates.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.to_csv('/tmp/daft-houses-processed.csv')
def spambase_transform(input_path, features_path, labels_path, metadata_path):
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES), CLASSES)

    input_file = open(input_path, "r")

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    i = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        assert len(values) - 1 == len(VARIABLES), str(
            (len(values) - 1, len(VARIABLES)))

        for j, value in enumerate(values[:-1]):
            value = float(value)
            features[i, j] = value

        labels[i] = int(values[-1])

        i += 1
        line = input_file.readline()

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"]

    num_positive_samples = int(labels.sum())
    num_negative_samples = labels.shape[0] - num_positive_samples

    assert num_negative_samples == NUM_SAMPLES[0]
    assert num_positive_samples == NUM_SAMPLES[1]

    print("Negative samples: ", num_negative_samples)
    print("Positive samples: ", num_positive_samples)
    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
def pearson(A, B, scale=True):
    correlation = 0
    if scale:
        scaler = MinMaxScaler()
        A = scaler.fit_transform(A)
        B = scaler.fit_transform(B)
    for i in range(A.shape[1]):
        correlation = correlation + pearsonr(A[:, i], B[:, i])[0]
    return correlation / A.shape[1]
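# A minimal usage sketch for the pearson() helper above, not part of the
# original snippet. It assumes numpy, scipy and scikit-learn are installed
# and uses synthetic data: column-wise Pearson correlations between the two
# matrices are averaged after both are min-max scaled.
import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler

rng = np.random.RandomState(0)
A = rng.rand(100, 3)
B = A + 0.1 * rng.randn(100, 3)  # noisy copy of A, so correlations are high
print(pearson(A, B))  # average column-wise Pearson correlation, close to 1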
def letter_recognition_transform(input_path, features_path, labels_path,
                                 metadata_path):
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES), CLASSES)

    input_file = open(input_path, "r")

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    i = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        assert len(values) - 1 == len(VARIABLES), str(
            (len(values) - 1, len(VARIABLES)))

        for j, value in enumerate(values[1:]):
            value = float(value)
            features[i, j] = value

        labels[i] = CLASS_TO_INDEX[values[0]]

        i += 1
        line = input_file.readline()

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"]

    for class_index in range(len(NUM_SAMPLES)):
        num_samples_class = (labels == class_index).sum()
        assert num_samples_class == NUM_SAMPLES[class_index]

    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
def default_credit_card_transform(input_path, features_path, labels_path,
                                  metadata_path):
    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)

    variables = set(reader.fieldnames)
    variables.remove("ID")
    variables.remove("default payment next month")

    metadata = create_metadata(variables, TYPES, VALUES, NUM_SAMPLES, CLASSES)

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    for i, row in enumerate(reader):
        # the categorical variables are already one hot encoded
        for j, variable in enumerate(metadata["variables"]):
            value = row[variable]
            if TYPES[variable] == "numerical":
                value = float(value)
                features[i, metadata["value_to_index"][variable]] = value
            elif TYPES[variable] == "categorical":
                value = value.replace(".0", "")
                assert value in VALUES[variable], \
                    "'{}' is not a valid value for '{}'".format(value, variable)
                features[i, metadata["value_to_index"][variable][value]] = 1.0

        # the class needs to be transformed
        labels[i] = int(row["default payment next month"].replace(".0", ""))

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"] - 1

    num_positive_samples = int(labels.sum())
    num_negative_samples = labels.shape[0] - num_positive_samples

    print("Negative samples: ", num_negative_samples)
    print("Positive samples: ", num_positive_samples)
    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
def test_min_max_scaler_zero_variance_features():
    # Check min max scaler on toy data with zero variance features
    X = [[0., 1., +0.5], [0., 1., -0.1], [0., 1., +1.1]]

    X_new = [[+0., 2., 0.5], [-1., 1., 0.0], [+0., 1., 1.5]]

    # default params
    scaler = MinMaxScaler()
    X_trans = scaler.fit_transform(X)
    X_expected_0_1 = [[0., 0., 0.5], [0., 0., 0.0], [0., 0., 1.0]]
    assert_array_almost_equal(X_trans, X_expected_0_1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    X_trans_new = scaler.transform(X_new)
    X_expected_0_1_new = [[+0., 1., 0.500], [-1., 0., 0.083], [+0., 0., 1.333]]
    assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)

    # not default params
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    X_expected_1_2 = [[1., 1., 1.5], [1., 1., 1.0], [1., 1., 2.0]]
    assert_array_almost_equal(X_trans, X_expected_1_2)

    # function interface
    X_trans = minmax_scale(X)
    assert_array_almost_equal(X_trans, X_expected_0_1)
    X_trans = minmax_scale(X, feature_range=(1, 2))
    assert_array_almost_equal(X_trans, X_expected_1_2)
class MinMaxScalerImpl():

    def __init__(self, feature_range=(0, 1), copy=True):
        self._hyperparams = {'feature_range': feature_range, 'copy': copy}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
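# Hedged usage sketch for the MinMaxScalerImpl wrapper above, not part of the
# original snippet. It assumes SKLModel is an alias for
# sklearn.preprocessing.MinMaxScaler, which is what the wrapper's
# hyperparameters suggest.
import numpy as np
from sklearn.preprocessing import MinMaxScaler as SKLModel

X = np.array([[1., 10.], [2., 20.], [3., 30.]])
impl = MinMaxScalerImpl(feature_range=(0, 1))
print(impl.fit(X).transform(X))  # each column independently scaled to [0, 1]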
class Scaler(TransformerMixin):

    def __init__(self):
        self._scaler = MinMaxScaler(feature_range=(-1, 1))

    def transform(self, df, *_):
        assert_all_finite(df)
        scaled = self._scaler.transform(df)
        df = pd.DataFrame(scaled, columns=df.columns)
        assert_all_finite(df)
        return df

    def fit(self, df, *_):
        self._scaler.fit(df)
        return self
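# Small usage sketch for the Scaler wrapper above, not part of the original
# snippet. It assumes the usual imports for that class (pandas as pd,
# sklearn.base.TransformerMixin, sklearn.preprocessing.MinMaxScaler,
# sklearn.utils.assert_all_finite); the DataFrame here is made up.
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
scaled = Scaler().fit(df).transform(df)  # every column mapped into [-1, 1]
print(scaled)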
def classify(X_train, y_train, X_test, y_test):
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)

    liberatore_NB = GaussianNB()
    liberatore_NB.fit(X_train, y_train)
    del X_train

    X_test = scaler.transform(X_test)
    predictions = liberatore_NB.predict(X_test)

    return y_test, predictions
def correct_values(values, min_value, max_value):
    '''
    Ensures that values are in given range

    @param values: 1d numpy array
    '''
    # scale

    # do nothing if valid values
    lowest_val = np.min(values)
    largest_val = np.max(values)
    lowest_val_valid = lowest_val >= min_value and lowest_val < max_value
    largest_val_valid = largest_val <= max_value and largest_val > min_value
    #print("allowed: min_val: ", min_value, " max_val: ", max_value)
    #print("current: min_val: ", lowest_val, "max_val: ", largest_val)

    if lowest_val_valid and largest_val_valid:
        pass
    else:
        #print("at least one not valid")

        # +/-1 to prevent AssertionErrors caused by rounding errors
        # -> +/-1 introduces new exception: "ValueError: Minimum of desired
        # feature range must be smaller than maximum. Got (84.80001171045868,
        # 84). -> Therefore used without +-1 and adapted assertions.
        min_value_for_scaler = min_value  # + 1
        max_value_for_scaler = max_value  # - 1

        # re-use max/min values in data if valid, otherwise all functions would
        # be in same range
        if lowest_val_valid:
            #print("lowest valid")
            min_value_for_scaler = lowest_val
        if largest_val_valid:
            #print("largest valid")
            max_value_for_scaler = largest_val

        scaler = MinMaxScaler(feature_range=(
            min_value_for_scaler, max_value_for_scaler))
        reshaped_values = values.reshape(-1, 1)  # otherwise DeprecationWarning
        scaler = scaler.fit(reshaped_values)
        values = scaler.transform(reshaped_values)
        values = np.reshape(values, len(values))  # original shape

        # print("afterwards: min_val: ", np.min(
        #     values), " max_val: ", np.max(values))

        min_in_scaled = np.min(values)
        max_in_scaled = np.max(values)

        # test whether min_value <= min_in_scaled
        assert min_value - min_in_scaled <= 0.0000001, "current min: " + \
            str(min_in_scaled) + " but allowed min is: " + str(min_value)
        # test whether max_in_scaled <= max_value
        assert max_in_scaled - max_value <= 0.000001, "current max: " + str(max_in_scaled) + \
            " but allowed max is: " + str(max_value)

    return values
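# Hedged usage sketch for correct_values() above, not part of the original
# snippet: values outside the allowed [0, 10] range are rescaled back into it,
# while an array that is already in range would be returned unchanged.
import numpy as np

values = np.array([-3.0, 2.0, 7.5, 14.0])
print(correct_values(values, min_value=0.0, max_value=10.0))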
def get_pipeline(features):
    feature_names = []
    for feature in features:
        feature_names += feature[1].FEATS
    print(feature_names)
    return Pipeline(features + [('transform',
                                 ToMatrix(features=feature_names)),
                                ('norm', MinMaxScaler())])
def run(test, train):
    # generate features
    # feats = get_serialized_pipeline(train)
    feats = get_serialized_pipeline(train)
    train_x = feats.fit_transform(train)
    test_x = feats.fit_transform(test)

    # train
    train_y = [1 if sent.label_test > 0 else 0 for sent in train]
    clf = MLPClassifier(max_iter=300,
                        solver='sgd',
                        alpha=4,
                        hidden_layer_sizes=(200, 50),
                        random_state=42,
                        activation='relu',
                        learning_rate_init=0.04,
                        batch_size=550)
    clf.fit(train_x, train_y)

    # predict
    predictions = clf.predict(test_x)
    pred_probs = clf.predict_proba(test_x)
    pred_probs = MinMaxScaler().fit_transform(
        np.reshape([pred[1] for pred in pred_probs], (-1, 1))).tolist()

    for i, sent in enumerate(test):
        sent.pred_label = predictions[i]
        sent.pred = pred_probs[i]
    return test
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if (y is not None):
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
def _create_scaler(self, positivity):
    self.scaler_positivity = positivity
    if positivity is True:
        eps = 1e-9
        self._scaler = MinMaxScaler(feature_range=(eps, 1))
    else:
        self._scaler = StandardScaler()
    self.scaler_is_fitted = False
def test_min_max_scaler_1d(): """Test scaling of dataset along single axis""" rng = np.random.RandomState(0) X = rng.randn(5) X_orig_copy = X.copy() scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(X_scaled.min(axis=0), 0.0) assert_array_almost_equal(X_scaled.max(axis=0), 1.0) # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X_orig_copy) # Test with 1D list X = [0., 1., 2, 0.4, 1.] scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(X_scaled.min(axis=0), 0.0) assert_array_almost_equal(X_scaled.max(axis=0), 1.0) # Constant feature. X = np.zeros(5) scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_greater_equal(X_scaled.min(), 0.) assert_less_equal(X_scaled.max(), 1.)
def test_warning_scaling_integers():
    # Check warning when scaling integer data
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    w = "Data with input dtype uint8 was converted to float64"

    clean_warning_registry()
    assert_warns_message(DataConversionWarning, w, scale, X)
    assert_warns_message(DataConversionWarning, w, StandardScaler().fit, X)
    assert_warns_message(DataConversionWarning, w, MinMaxScaler().fit, X)
def test_warning_scaling_integers(): """Check warning when scaling integer data""" X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8) w = "assumes floating point values as input, got uint8" clean_warning_registry() assert_warns_message(UserWarning, w, scale, X) assert_warns_message(UserWarning, w, StandardScaler().fit, X) assert_warns_message(UserWarning, w, MinMaxScaler().fit, X)
def classifier_dyer2012(X_train,
                        y_train,
                        X_test,
                        y_test,
                        time_train=None,
                        time_test=None):
    obj = Dyer2012VNGPlusPlusClassifier()
    X_train, fields = dyer2012_tracestoInstances(obj, X_train, time_train)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)

    models1 = {
        'Bernoulli': BernoulliNB(),
        'Gaussian': GaussianNB(),
        'Multinomial': MultinomialNB(),
    }

    params1 = {
        'Bernoulli': {},
        'Gaussian': {},
        'Multinomial': {},
        #'SVC': [
        #    {'kernel': ['linear'], 'C': [1, 10]},
        #    {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        #]
    }

    dyer_NB = MultinomialNB()
    dyer_NB.fit(X_train, y_train)
    del X_train

    # test
    X_test, fields = dyer2012_tracestoInstances(obj, X_test, time_test, fields)
    X_test = scaler.transform(X_test)
    predictions = dyer_NB.predict(X_test)
    del X_test

    labels = []
    for l in y_train:
        if l not in labels:
            labels.append(l)

    return y_test, predictions
def xtraintestdata(self, datarray, yarray, dfiletowrite):
    x_train, x_test, y_train, y_test = train_test_split(datarray,
                                                        yarray,
                                                        test_size=0.2,
                                                        random_state=1)
    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.2,
                                                      random_state=1)

    min_max_scaler = MinMaxScaler()
    # feed in a numpy array
    x_train_norm = min_max_scaler.fit_transform(x_train)
    _ = np.c_[x_train_norm, y_train]

    dirme = dfiletowrite
    sio.savemat(dirme, mdict={'UCIDat': yarray})

    xy_valid = np.c_[x_val, y_val]
    xy_train = np.c_[x_train, y_train]
    xy_test = np.c_[x_test, y_test]

    return xy_train, xy_test, xy_valid
def load_data(n_samples, label_scaling: bool = False, method: str = 'maxmin'):
    """Take in Brian's data and spit out some numpy arrays for the PAL"""
    df_full_factorial_feat = pd.read_csv(
        os.path.join(DATADIR, 'new_features_full_random.csv'))[FEATURES].values
    a2 = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values
    deltaGMax = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values  # pylint:disable=unused-variable
    gibbs = pd.read_csv(os.path.join(
        DATADIR, 'b1-b21_random_deltaG.csv'))['deltaGmin'].values * (-1)
    gibbs_max = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['deltaGmax'].values
    force_max = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_fit2.csv'))['F_repel_max'].values  # pylint:disable=unused-variable
    rg = pd.read_csv(os.path.join(DATADIR, 'rg_results.csv'))['Rg'].values

    y = np.hstack(
        [rg.reshape(-1, 1), gibbs.reshape(-1, 1), gibbs_max.reshape(-1, 1)])
    assert len(df_full_factorial_feat) == len(a2) == len(gibbs) == len(y)

    feat_scaler = StandardScaler()
    X = feat_scaler.fit_transform(df_full_factorial_feat)

    if label_scaling:
        label_scaler = MinMaxScaler()
        y = label_scaler.fit_transform(y)

    if method == 'maxmin':
        greedy_indices = get_maxmin_samples(X, n_samples)
    elif method == 'kmeans':
        greedy_indices = get_kmeans_samples(X, n_samples)

    return X, y, greedy_indices
def test_warning_scaling_integers(): """Check warning when scaling integer data""" X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8) with warnings.catch_warnings(record=True): warnings.simplefilter("always") assert_warns(UserWarning, StandardScaler().fit, X) with warnings.catch_warnings(record=True): warnings.simplefilter("always") assert_warns(UserWarning, MinMaxScaler().fit, X)
class CreateMinMaxScaler(CreateModel):

    def fit(self, data, args):
        self.model = MinMaxScaler()

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval

    def test(self, data):
        assert self.model is not None
        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)

        data.learning_task = LearningTask.REGRESSION
        return t.interval
def read_svm_pred(test_sent, input_file):
    input = open(input_file)
    ranks = []
    for line in input:
        ranks.append(float(line.strip()))
    # MinMaxScaler expects a 2-D array, so reshape the scores into a column
    # vector and flatten the result back to one score per sentence
    ranks = MinMaxScaler().fit_transform(
        np.array(ranks).reshape(-1, 1)).ravel()
    for i, sent in enumerate(test_sent):
        test_sent[i].pred = ranks[i]
        test_sent[i].pred_label = 1 if ranks[i] >= 0.5 else 0
    return test_sent
def prepare_df_for_violinplot(df,
                              feature_cols,
                              class_col,
                              class_indices=None,
                              minmaxscale=True):
    """
    Min-max-scale the data and then melt the dataframe into the long format
    """
    if class_indices:
        df = df.loc[list(class_indices)]

    df = df[feature_cols + [class_col]]

    if minmaxscale:
        # use the public import path; sklearn.preprocessing.data is a private
        # module that was removed in newer scikit-learn releases
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        df[feature_cols] = scaler.fit_transform(df[feature_cols])

    prepared_df = pd.melt(df, value_vars=feature_cols, id_vars=class_col)
    return prepared_df
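# Illustrative call for prepare_df_for_violinplot() above, not part of the
# original snippet; it assumes pandas is imported as pd, and the column names
# and data are made up.
import pandas as pd

toy = pd.DataFrame({'f1': [1.0, 2.0, 3.0],
                    'f2': [10.0, 20.0, 30.0],
                    'label': ['a', 'b', 'a']})
long_df = prepare_df_for_violinplot(toy, ['f1', 'f2'], 'label')
print(long_df.head())  # long format with 'variable'/'value' columns, ready for a violin plot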
def make_models(X, y, y_bin):
    return dict(
        ols=LinearRegression().fit(X, y),
        lr_bin=LogisticRegression().fit(X, y_bin),
        lr_ovr=LogisticRegression(multi_class='ovr').fit(X, y),
        lr_mn=LogisticRegression(solver='lbfgs',
                                 multi_class='multinomial').fit(X, y),
        svc=SVC(kernel='linear').fit(X, y_bin),
        svr=SVR(kernel='linear').fit(X, y),
        dtc=DecisionTreeClassifier(max_depth=4).fit(X, y),
        dtr=DecisionTreeRegressor(max_depth=4).fit(X, y),
        rfc=RandomForestClassifier(n_estimators=3, max_depth=3,
                                   random_state=1).fit(X, y),
        rfr=RandomForestRegressor(n_estimators=3, max_depth=3,
                                  random_state=1).fit(X, y),
        gbc=GradientBoostingClassifier(n_estimators=3, max_depth=3,
                                       random_state=1).fit(X, y),
        gbr=GradientBoostingRegressor(n_estimators=3, max_depth=3,
                                      random_state=1).fit(X, y),
        abc=AdaBoostClassifier(algorithm='SAMME', n_estimators=3,
                               random_state=1).fit(X, y),
        abc2=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3,
                                random_state=1).fit(X, y),
        abc3=AdaBoostClassifier(algorithm='SAMME', n_estimators=3,
                                random_state=1).fit(X, y_bin),
        abc4=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3,
                                random_state=1).fit(X, y_bin),
        km=KMeans(1).fit(X),
        km2=KMeans(5).fit(X),
        pc1=PCA(1).fit(X),
        pc2=PCA(2).fit(X),
        pc3=PCA(2, whiten=True).fit(X),
        mlr1=MLPRegressor([2], 'relu').fit(X, y),
        mlr2=MLPRegressor([2, 1], 'tanh').fit(X, y),
        mlr3=MLPRegressor([2, 2, 2], 'identity').fit(X, y),
        mlc=MLPClassifier([2, 2], 'tanh').fit(X, y),
        mlc_bin=MLPClassifier([2, 2], 'identity').fit(X, y_bin),
        bin=Binarizer(0.5),
        mms=MinMaxScaler().fit(X),
        mas=MaxAbsScaler().fit(X),
        ss1=StandardScaler().fit(X),
        ss2=StandardScaler(with_mean=False).fit(X),
        ss3=StandardScaler(with_std=False).fit(X),
        n1=Normalizer('l1'),
        n2=Normalizer('l2'),
        n3=Normalizer('max'))
def test_min_max_scaler_zero_variance_features(): """Check min max scaler on toy data with zero variance features""" X = [[0., 1., +0.5], [0., 1., -0.1], [0., 1., +1.1]] X_new = [[+0., 2., 0.5], [-1., 1., 0.0], [+0., 1., 1.5]] # default params scaler = MinMaxScaler() X_trans = scaler.fit_transform(X) X_expected_0_1 = [[0., 0., 0.5], [0., 0., 0.0], [0., 0., 1.0]] assert_array_almost_equal(X_trans, X_expected_0_1) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) X_trans_new = scaler.transform(X_new) X_expected_0_1_new = [[+0., 1., 0.500], [-1., 0., 0.083], [+0., 0., 1.333]] assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2) # not default params scaler = MinMaxScaler(feature_range=(1, 2)) X_trans = scaler.fit_transform(X) X_expected_1_2 = [[1., 1., 1.5], [1., 1., 1.0], [1., 1., 2.0]] assert_array_almost_equal(X_trans, X_expected_1_2)
def classifier_panchenko2016(X_train,
                             y_train,
                             X_test,
                             y_test,
                             separateClassifier=False):
    train_or_test_labels = ["train" for i in y_train] + ["test" for i in y_test]
    y_train, X_train, y_test, X_test = outlier_removal(train_or_test_labels,
                                                       X_train + X_test,
                                                       y_train + y_test)

    y_train, X_train = features_extraction(
        y_train,
        X_train,
        separateClassifier=separateClassifier,
        featuresCount=100)
    y_test, X_test = features_extraction(y_test,
                                         X_test,
                                         separateClassifier=separateClassifier,
                                         featuresCount=100)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier = SVC(kernel="rbf",
                     C=2e11,
                     gamma=2e-1,
                     max_iter=5000,
                     class_weight="balanced",
                     verbose=1)

    print("fitting")
    classifier.fit(X_train, y_train)

    print("testing")
    y_predictions = classifier.predict(X_test)  #, y_test)

    return y_test, y_predictions
def get_serialized_pipeline(train):
    from src.features import counting_feat, knn_similarity
    config = get_config()
    feature_names = [
        file_name for file_name in listdir(config['features_dump_dir'])
    ]
    return Pipeline([
        ('read', ReadFeatures(feature_names)),
        ("train_search", knn_similarity.TrainSearch(train=train)),
        ('tfidf', counting_feat.BagOfTfIDF(train)),  # cb
        ('transform', ToMatrix(features=feature_names)),
        ('norm', MinMaxScaler())
    ])