def test_min_max_scaler_iris():
    """MinMaxScaler on iris: default and custom ranges, with round-trips.

    FIX: the original asserted the column minima twice; the duplicate is
    removed.
    """
    X = iris.data
    scaler = MinMaxScaler()
    # default params: features mapped onto [0, 1]
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 0)
    assert_array_almost_equal(X_trans.max(axis=0), 1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # not default params: min=1, max=2
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 1)
    assert_array_almost_equal(X_trans.max(axis=0), 2)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # min=-.5, max=.6
    scaler = MinMaxScaler(feature_range=(-.5, .6))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), -.5)
    assert_array_almost_equal(X_trans.max(axis=0), .6)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # raises on invalid range
    scaler = MinMaxScaler(feature_range=(2, 1))
    assert_raises(ValueError, scaler.fit, X)
def test_min_max_scaler_1d():
    # Scaling of a dataset that has only a single axis.
    rng = np.random.RandomState(0)
    data = rng.randn(5)
    original = data.copy()

    scaler = MinMaxScaler()
    scaled = scaler.fit(data).transform(data)
    assert_array_almost_equal(scaled.min(axis=0), 0.0)
    assert_array_almost_equal(scaled.max(axis=0), 1.0)
    # inverse transform must round-trip to the original values
    restored = scaler.inverse_transform(scaled)
    assert_array_almost_equal(restored, original)

    # a plain Python list works as input too
    data = [0., 1., 2, 0.4, 1.]
    scaler = MinMaxScaler()
    scaled = scaler.fit(data).transform(data)
    assert_array_almost_equal(scaled.min(axis=0), 0.0)
    assert_array_almost_equal(scaled.max(axis=0), 1.0)

    # a constant feature must stay inside [0, 1]
    data = np.zeros(5)
    scaler = MinMaxScaler()
    scaled = scaler.fit(data).transform(data)
    assert_greater_equal(scaled.min(), 0.)
    assert_less_equal(scaled.max(), 1.)
def test_min_max_scaler_zero_variance_features():
    # Min-max scaling of toy data that contains zero-variance features.
    X = [[0., 1., +0.5], [0., 1., -0.1], [0., 1., +1.1]]
    X_new = [[+0., 2., 0.5], [-1., 1., 0.0], [+0., 1., 1.5]]

    # default feature range (0, 1)
    scaler = MinMaxScaler()
    transformed = scaler.fit_transform(X)
    expected_0_1 = [[0., 0., 0.5], [0., 0., 0.0], [0., 0., 1.0]]
    assert_array_almost_equal(transformed, expected_0_1)
    # round-trips back to the original data
    inverse = scaler.inverse_transform(transformed)
    assert_array_almost_equal(X, inverse)
    # previously unseen data may fall outside the fitted range
    transformed_new = scaler.transform(X_new)
    expected_0_1_new = [[+0., 1., 0.500], [-1., 0., 0.083], [+0., 0., 1.333]]
    assert_array_almost_equal(transformed_new, expected_0_1_new, decimal=2)

    # custom feature range (1, 2)
    scaler = MinMaxScaler(feature_range=(1, 2))
    transformed = scaler.fit_transform(X)
    expected_1_2 = [[1., 1., 1.5], [1., 1., 1.0], [1., 1., 2.0]]
    assert_array_almost_equal(transformed, expected_1_2)

    # the function interface must mirror the estimator
    assert_array_almost_equal(minmax_scale(X), expected_0_1)
    assert_array_almost_equal(minmax_scale(X, feature_range=(1, 2)),
                              expected_1_2)
def get_pipeline(features):
    """Build a pipeline from the given (name, transformer) feature steps,
    appending a dict-to-matrix transformer and a min-max scaler.

    FIX: removed the leftover debug print of the feature names (the
    sibling `get_pipeline(features, to_matrix=...)` variant has none).
    """
    feature_names = []
    for feature in features:
        # each step's transformer advertises the features it emits
        feature_names += feature[1].FEATS
    return Pipeline(features + [
        ('transform', ToMatrix(features=feature_names)),
        ('norm', MinMaxScaler()),
    ])
def run(test, train):
    """Featurize, train an MLP on `train`, and attach predicted labels and
    min-max-scaled positive-class probabilities to each sentence in `test`.

    Returns the `test` list with `pred_label` / `pred` set on each item.
    """
    # generate features
    feats = get_serialized_pipeline(train)
    train_x = feats.fit_transform(train)
    # FIX: the feature pipeline must only be FITTED on the training data;
    # the original called fit_transform on the test set too, refitting
    # stateful steps (e.g. TF-IDF) on test data — a leak that also yields
    # an inconsistent feature space.
    test_x = feats.transform(test)
    # train
    train_y = [1 if sent.label_test > 0 else 0 for sent in train]
    clf = MLPClassifier(max_iter=300, solver='sgd', alpha=4,
                        hidden_layer_sizes=(200, 50), random_state=42,
                        activation='relu', learning_rate_init=0.04,
                        batch_size=550)
    clf.fit(train_x, train_y)
    # predict
    predictions = clf.predict(test_x)
    pred_probs = clf.predict_proba(test_x)
    # rescale the positive-class probabilities onto [0, 1]
    pred_probs = MinMaxScaler().fit_transform(
        np.reshape([pred[1] for pred in pred_probs], (-1, 1))).tolist()
    for i, sent in enumerate(test):
        sent.pred_label = predictions[i]
        sent.pred = pred_probs[i]
    return test
def fit(self, data, args):
    """Fit a MinMaxScaler on the training split and return the wall-clock
    time the fit took (``args`` is accepted for interface parity)."""
    self.model = MinMaxScaler()
    with Timer() as timer:
        self.model.fit(data.X_train, data.y_train)
    return timer.interval
def load_data(n_samples, label_scaling: bool = False):
    """Take in Brian's data and spit out some numpy arrays for the PAL"""
    raw_features = pd.read_csv(
        os.path.join(DATADIR, 'new_features_full_random.csv'))[FEATURES].values
    a2 = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values
    gibbs = -pd.read_csv(
        os.path.join(DATADIR, 'b1-b21_random_deltaG.csv'))['deltaGmin'].values
    gibbs_max = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['deltaGmax'].values
    rg = pd.read_csv(os.path.join(DATADIR, 'rg_results.csv'))['Rg'].values

    # targets: radius of gyration, -deltaGmin, deltaGmax (one column each)
    y = np.column_stack([rg, gibbs, gibbs_max])
    assert len(raw_features) == len(a2) == len(gibbs) == len(y)

    X = StandardScaler().fit_transform(raw_features)
    if label_scaling:
        y = MinMaxScaler().fit_transform(y)

    greedy_indices = get_maxmin_samples(X, n_samples)

    # blank out a random ~third of the third target with NaNs
    nan_indices = np.unique(np.random.randint(0, len(y) - 1, int(len(y) / 3)))
    y[nan_indices, 2] = np.nan
    return X, y, greedy_indices
def parkinsons_replicated_data(self, park_dat):
    """Load the replicated Parkinson's CSV, min-max scale the features,
    display a correlation-matrix plot, one-hot encode the Status labels
    and return a train/test split.

    NOTE(review): several intermediates (`xdata`, the per-column loop
    normalisation of `xfeatures`) are computed but never used; the split
    is done on the globally-normalised `x` — confirm this is intended.
    """
    df_parkinson = pd.read_csv(park_dat, sep=',')
    ylabel = df_parkinson['Status']
    xfeatures = df_parkinson.drop(['Status', 'ID'], axis=1)
    xfeats = df_parkinson.drop(['Status', 'ID'], axis=1).values
    # global min-max over the WHOLE matrix (not per column)
    x = (xfeats - np.min(xfeats)) / (np.max(xfeats) - np.min(xfeats))
    y = df_parkinson['Status'].values
    xfeatsp = pd.DataFrame(xfeatures)
    # per-column min-max scaling, used only for the correlation plot
    minmax_scaling = MinMaxScaler()
    x_scaledp = minmax_scaling.fit_transform(xfeatsp)
    x_scaledp = pd.DataFrame(x_scaledp)
    f1 = plt.figure(figsize=(19, 16))
    plt.matshow(x_scaledp.corr(), fignum=f1.number)
    # NOTE(review): the second xticks call overrides rotation=45 set by
    # the first — probably only one of the two is wanted.
    plt.xticks(range(x_scaledp.shape[1]), x_scaledp.columns, fontsize=10,
               rotation=45)
    plt.xticks(range(x_scaledp.shape[1]), x_scaledp.columns, fontsize=10)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=12)
    plt.show()
    # NOTE(review): divides by max instead of (max - min); the result is
    # unused below in any case.
    for eachx in xfeatures:
        xfeatures[eachx] = (xfeatures[eachx] - xfeatures[eachx].min()
                            ) / xfeatures[eachx].max()
    ylabel = ylabel.values
    # ydata = ylabel[:, None]
    xdata = x_scaledp.to_numpy()
    # one-hot encode the (binary) status labels
    targets = np.array(ylabel).reshape(-1)
    y = np.eye(2)[targets]
    xtrain, xtest, y_train, y_test = train_test_split(
        x, y, test_size=0.30)  #, shuffle=False)
    print(y_test)
    #y_train = ytrain[:, None]
    #y_test = ytest[:, None]
    return xtrain, xtest, y_train, y_test
def _create_scaler(self, positivity):
    """Select the scaler: a strictly-positive min-max range when
    `positivity` is exactly True, standardization otherwise."""
    self.scaler_positivity = positivity
    # identity check on purpose: only the literal True selects min-max
    if positivity is True:
        eps = 1e-9  # keeps the lower bound strictly above zero
        self._scaler = MinMaxScaler(feature_range=(eps, 1))
    else:
        self._scaler = StandardScaler()
    # a freshly created scaler has seen no data yet
    self.scaler_is_fitted = False
def process_data():
    """Load the UCD viewings CSV, derive normalised price / cycle-time
    scores and a rank, emit diagnostic plots to /tmp and write the
    processed CSV.

    NOTE(review): input/output paths are hard-coded to one machine —
    parameterize before reuse.
    """
    file_path = '/Users/fpena/Stuff/House Search/Dublin/viewings-ucd.csv'
    data_frame = pandas.read_csv(file_path)
    print(data_frame.columns.values.tolist())
    print(data_frame.head())
    print(data_frame.describe())
    print(data_frame['Price'])
    # Lower price / cycle time => higher score, hence 1 - scaled value.
    # The same scaler instance is simply refitted for each column.
    price_scaler = MinMaxScaler()
    data_frame['Price Score'] = 1 - price_scaler.fit_transform(
        data_frame[['Price']])
    data_frame['Cycle Time Score'] = 1 - price_scaler.fit_transform(
        data_frame[['Cycle Time']])
    data_frame['Score'] = 0.5 * (data_frame['Price Score'] +
                                 data_frame['Cycle Time Score'])
    data_frame['Rank'] = data_frame['Score'].rank(
        ascending=True) / (len(data_frame))
    # monthly cost estimate: rent plus commute time priced per hour
    cycle_hour_cost = 30
    working_days_per_month = 22
    data_frame['Money Score'] =\
        data_frame['Price'] + data_frame['Cycle Time'] / 60 * cycle_hour_cost * working_days_per_month
    data_frame.rename(columns={'Cycle Time': 'Cycle'}, inplace=True)
    # print(data_frame['Price Score'])
    # print(data_frame[['Score', 'Rank']])
    # with pandas.option_context('display.max_rows', 500, 'display.max_columns', 10):
    #     print(data_frame[['Address', 'Price', 'Cycle', 'Rank', 'Score']].sort_values('Rank', ascending=False))
    # print(data_frame[['Address', 'Price', 'Cycle', 'Rank', 'Score', 'Money Score']].to_string())
    print(data_frame[[
        'Address', 'Price', 'Cycle', 'Rank', 'Score', 'Money Score'
    ]].sort_values('Rank', ascending=False).to_string())

    # seaborn.(x='Price', y='Cycle Time', data_frame=data_frame)
    data_frame.plot.scatter(x='Price', y='Cycle')
    pyplot.savefig('/tmp/daft_scatter.pdf')
    pyplot.cla()
    pyplot.clf()
    data_frame.plot.scatter(x='Price Score', y='Cycle Time Score')
    pyplot.savefig('/tmp/daft_scatter_norm.pdf')
    pyplot.cla()
    pyplot.clf()
    seaborn.stripplot(x='Accommodation Type', y='Price', data=data_frame,
                      jitter=True)
    pyplot.savefig('/tmp/daft_price.pdf')
    pyplot.cla()
    pyplot.clf()
    data_frame.plot.scatter(x='Housemates', y='Price')
    pyplot.savefig('/tmp/daft_scatter_price_housemates.pdf')
    pyplot.cla()
    pyplot.clf()
    data_frame.to_csv('/tmp/daft-houses-processed.csv')
def spambase_transform(input_path, features_path, labels_path, metadata_path):
    """Transform the raw spambase CSV into scaled feature/label .npy files
    plus a JSON metadata file.

    Each row is "<57 numeric features>,<label>"; features are min-max
    scaled in place and the scaling bounds are stored in the metadata.

    FIX: the input file is now opened with a context manager, so it is
    closed even when an assertion or parse error fires mid-file.
    """
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES), CLASSES)

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform: last column is the label, the rest are features
    with open(input_path, "r") as input_file:
        i = 0
        for line in input_file:
            values = line.rstrip("\n").split(",")
            assert len(values) - 1 == len(VARIABLES), str(
                (len(values) - 1, len(VARIABLES)))
            for j, value in enumerate(values[:-1]):
                features[i, j] = float(value)
            labels[i] = int(values[-1])
            i += 1

    # scale in place (copy=False mutates `features`)
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"]

    num_positive_samples = int(labels.sum())
    num_negative_samples = labels.shape[0] - num_positive_samples
    assert num_negative_samples == NUM_SAMPLES[0]
    assert num_positive_samples == NUM_SAMPLES[1]

    print("Negative samples: ", num_negative_samples)
    print("Positive samples: ", num_positive_samples)
    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    # record the scaling bounds so the transform can be inverted later
    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()
    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
def learn(examples, Classifier, classifierArgs, develFolds=10, verbose=3,
          n_jobs=1, predKey="ml_comb_pred", limitTerms=None):
    """Grid-search a classifier on the devel subset, predict all and test
    examples, min-max rescale the probabilities and evaluate the test set.

    Python 2 code (print statements).  `examples` is presumably a dict
    with "features", "classes", "proteins" keys — TODO confirm upstream.
    """
    # hyper-parameter search, cross-validated on the devel subset only
    print "Parameter grid search"
    develExamples = getSubset(examples, ["devel"])
    clf = GridSearchCV(Classifier(), classifierArgs, cv=develFolds,
                       verbose=verbose, n_jobs=n_jobs, scoring="f1_micro")
    clf.fit(develExamples["features"], develExamples["classes"])
    print "Best params", (clf.best_params_, clf.best_score_)

    print "Predicting all examples"
    # probabilities/margins are rescaled into [0.03, 1.0]
    minMax = MinMaxScaler((0.03, 1.0))
    allPredictions = clf.predict(examples["features"])
    # fall back to decision_function for margin-based classifiers
    if hasattr(clf, "predict_proba"):
        allProbabilities = clf.predict_proba(examples["features"])
    else:
        allProbabilities = clf.decision_function(examples["features"])
    #import pdb; pdb.set_trace()
    minMax.fit(
        allProbabilities)  #minmax_scale(testProbabilities, (0.03, 1.0))
    allProbabilities = minMax.transform(
        allProbabilities
    )  #allProbabilities = minmax_scale(allProbabilities, (0.03, 1.0))

    print "Predicting the test set"
    testExamples = getSubset(examples, ["test"])
    testPredictions = clf.predict(testExamples["features"])
    if hasattr(clf, "predict_proba"):
        testProbabilities = clf.predict_proba(testExamples["features"])
    else:
        testProbabilities = clf.decision_function(testExamples["features"])
    # reuse the scaler fitted on ALL probabilities for the test subset
    testProbabilities = minMax.transform(testProbabilities)
    binaryToMultiLabel(testExamples, testPredictions, testProbabilities,
                       predKey)

    print "Evaluating test set ensemble predictions"
    testProteins = {x["id"]: x for x in testExamples["proteins"]}
    multiLabelTestExamples = evaluateFile.makeExamples(
        testProteins, limitTerms=limitTerms, limitToSets=["test"],
        predKey=predKey)
    loading.vectorizeExamples(multiLabelTestExamples, None, sparseLabels=True)
    results = evaluation.evaluate(multiLabelTestExamples["labels"],
                                  multiLabelTestExamples["predictions"],
                                  multiLabelTestExamples, terms=None,
                                  averageOnly=True, noAUC=True)
    print "Average for test set:", evaluation.metricsToString(
        results["average"])
    # finally attach the all-example predictions as well
    binaryToMultiLabel(examples, allPredictions, allProbabilities, predKey)
def pearson(A, B, scale=True):
    """Average per-column Pearson correlation between matrices A and B.

    When `scale` is true each matrix is first min-max scaled
    independently before correlating.
    """
    if scale:
        scaler = MinMaxScaler()
        A = scaler.fit_transform(A)
        B = scaler.fit_transform(B)
    total = 0
    for col in range(A.shape[1]):
        total = total + pearsonr(A[:, col], B[:, col])[0]
    return total / A.shape[1]
def test_warning_scaling_integers():
    """Check warning when scaling integer data"""
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)
    message = "assumes floating point values as input, got uint8"
    clean_warning_registry()
    # every scaling entry point must emit the same UserWarning
    for fit_callable in (scale, StandardScaler().fit, MinMaxScaler().fit):
        assert_warns_message(UserWarning, message, fit_callable, X)
def test_warning_scaling_integers():
    # Integer input must trigger a DataConversionWarning on every scaler.
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)
    message = "Data with input dtype uint8 was converted to float64"
    clean_warning_registry()
    for fit_callable in (scale, StandardScaler().fit, MinMaxScaler().fit):
        assert_warns_message(DataConversionWarning, message, fit_callable, X)
def letter_recognition_transform(input_path, features_path, labels_path,
                                 metadata_path):
    """Transform the raw letter-recognition CSV into scaled feature/label
    .npy files plus a JSON metadata file.

    Each row is "<letter>,<numeric features>"; the letter becomes the
    class index and the features are min-max scaled in place.

    FIX: the input file is now opened with a context manager, so it is
    closed even when an assertion or parse error fires mid-file.
    """
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES), CLASSES)

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform: first column is the class letter, the rest are features
    with open(input_path, "r") as input_file:
        i = 0
        for line in input_file:
            values = line.rstrip("\n").split(",")
            assert len(values) - 1 == len(VARIABLES), str(
                (len(values) - 1, len(VARIABLES)))
            for j, value in enumerate(values[1:]):
                features[i, j] = float(value)
            labels[i] = CLASS_TO_INDEX[values[0]]
            i += 1

    # scale in place (copy=False mutates `features`)
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"]

    # sanity-check the per-class sample counts
    for class_index in range(len(NUM_SAMPLES)):
        num_samples_class = (labels == class_index).sum()
        assert num_samples_class == NUM_SAMPLES[class_index]

    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    # record the scaling bounds so the transform can be inverted later
    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()
    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
def read_svm_pred(test_sent, input_file):
    """Read one SVM rank score per line from `input_file`, min-max scale
    the scores to [0, 1] and attach them to the sentences in `test_sent`.

    Sets `pred` (scaled score) and `pred_label` (1 if score >= 0.5) on
    each sentence and returns the list.
    """
    # FIX: use a context manager so the file handle is always closed.
    with open(input_file) as handle:
        ranks = [float(line.strip()) for line in handle]
    # FIX: MinMaxScaler requires a 2-D (n_samples, n_features) input;
    # passing the flat list fails on modern scikit-learn.
    scaled = MinMaxScaler().fit_transform([[r] for r in ranks])
    ranks = [row[0] for row in scaled]
    for i, sent in enumerate(test_sent):
        test_sent[i].pred = ranks[i]
        test_sent[i].pred_label = 1 if ranks[i] >= 0.5 else 0
    return test_sent
def default_credit_card_transform(input_path, features_path, labels_path,
                                  metadata_path):
    """Transform the default-of-credit-card-clients CSV into scaled
    feature/label .npy files plus a JSON metadata file.

    Numerical variables are copied as floats; categorical ones are
    one-hot encoded via the metadata's value_to_index mapping.

    FIX: the input file is now opened with a context manager, so it is
    closed even when an assertion or parse error fires mid-file.
    """
    with open(input_path, "r") as input_file:
        reader = csv.DictReader(input_file)

        variables = set(reader.fieldnames)
        variables.remove("ID")
        variables.remove("default payment next month")

        metadata = create_metadata(variables, TYPES, VALUES, NUM_SAMPLES,
                                   CLASSES)

        features = np.zeros(
            (metadata["num_samples"], metadata["num_features"]),
            dtype=np.float32)
        labels = np.zeros(metadata["num_samples"], dtype=np.int32)

        # transform
        for i, row in enumerate(reader):
            # the categorical variables are already one hot encoded
            for j, variable in enumerate(metadata["variables"]):
                value = row[variable]
                if TYPES[variable] == "numerical":
                    value = float(value)
                    features[i, metadata["value_to_index"][variable]] = value
                elif TYPES[variable] == "categorical":
                    value = value.replace(".0", "")
                    assert value in VALUES[variable], \
                        "'{}' is not a valid value for '{}'".format(value,
                                                                    variable)
                    features[i,
                             metadata["value_to_index"][variable][value]] = 1.0
            # the class needs to be transformed
            labels[i] = int(row["default payment next month"].replace(".0",
                                                                      ""))

    # scale in place (copy=False mutates `features`)
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    # enumerate starts at 0, hence the -1
    assert i == metadata["num_samples"] - 1

    num_positive_samples = int(labels.sum())
    num_negative_samples = labels.shape[0] - num_positive_samples

    print("Negative samples: ", num_negative_samples)
    print("Positive samples: ", num_positive_samples)
    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    # record the scaling bounds so the transform can be inverted later
    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()
    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
def test_warning_scaling_integers():
    """Check warning when scaling integer data"""
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)
    # both scalers must warn; each gets a clean warning context
    for scaler_fit in (StandardScaler().fit, MinMaxScaler().fit):
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("always")
            assert_warns(UserWarning, scaler_fit, X)
def make_models(X, y, y_bin):
    """Fit a battery of scikit-learn estimators on the given data.

    Returns a dict of short name -> estimator.  Supervised models are fit
    on (X, y) or on the binary target (X, y_bin); the clustering/
    decomposition models and scalers are fit on X alone; Binarizer and
    Normalizer are returned unfitted (they are stateless here).
    """
    return dict(  # linear models
                ols=LinearRegression().fit(X, y),
                lr_bin=LogisticRegression().fit(X, y_bin),
                lr_ovr=LogisticRegression(multi_class='ovr').fit(X, y),
                lr_mn=LogisticRegression(solver='lbfgs',
                                         multi_class='multinomial').fit(X, y),
                # support vector machines
                svc=SVC(kernel='linear').fit(X, y_bin),
                svr=SVR(kernel='linear').fit(X, y),
                # trees and ensembles
                dtc=DecisionTreeClassifier(max_depth=4).fit(X, y),
                dtr=DecisionTreeRegressor(max_depth=4).fit(X, y),
                rfc=RandomForestClassifier(n_estimators=3, max_depth=3,
                                           random_state=1).fit(X, y),
                rfr=RandomForestRegressor(n_estimators=3, max_depth=3,
                                          random_state=1).fit(X, y),
                gbc=GradientBoostingClassifier(n_estimators=3, max_depth=3,
                                               random_state=1).fit(X, y),
                gbr=GradientBoostingRegressor(n_estimators=3, max_depth=3,
                                              random_state=1).fit(X, y),
                # AdaBoost: both algorithms, multi-class and binary targets
                abc=AdaBoostClassifier(algorithm='SAMME', n_estimators=3,
                                       random_state=1).fit(X, y),
                abc2=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3,
                                        random_state=1).fit(X, y),
                abc3=AdaBoostClassifier(algorithm='SAMME', n_estimators=3,
                                        random_state=1).fit(X, y_bin),
                abc4=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3,
                                        random_state=1).fit(X, y_bin),
                # clustering / decomposition
                km=KMeans(1).fit(X),
                km2=KMeans(5).fit(X),
                pc1=PCA(1).fit(X),
                pc2=PCA(2).fit(X),
                pc3=PCA(2, whiten=True).fit(X),
                # neural networks
                mlr1=MLPRegressor([2], 'relu').fit(X, y),
                mlr2=MLPRegressor([2, 1], 'tanh').fit(X, y),
                mlr3=MLPRegressor([2, 2, 2], 'identity').fit(X, y),
                mlc=MLPClassifier([2, 2], 'tanh').fit(X, y),
                mlc_bin=MLPClassifier([2, 2], 'identity').fit(X, y_bin),
                # preprocessors
                bin=Binarizer(0.5),
                mms=MinMaxScaler().fit(X),
                mas=MaxAbsScaler().fit(X),
                ss1=StandardScaler().fit(X),
                ss2=StandardScaler(with_mean=False).fit(X),
                ss3=StandardScaler(with_std=False).fit(X),
                n1=Normalizer('l1'),
                n2=Normalizer('l2'),
                n3=Normalizer('max'))
def get_serialized_pipeline(train):
    """Pipeline that reads dumped features, adds kNN-search and TF-IDF
    features, converts the dicts to a matrix and min-max scales it."""
    from src.features import counting_feat, knn_similarity

    config = get_config()
    feature_names = list(listdir(config['features_dump_dir']))
    steps = [
        ('read', ReadFeatures(feature_names)),
        ("train_search", knn_similarity.TrainSearch(train=train)),
        ('tfidf', counting_feat.BagOfTfIDF(train)),  # cb
        ('transform', ToMatrix(features=feature_names)),
        ('norm', MinMaxScaler()),
    ]
    return Pipeline(steps)
def get_pipeline(features, to_matrix=True):
    """
    Constructs a pipeline with the given features.
    Adds dict to matrix of features transformer and a scaler.
    """
    # without matrix conversion the steps are used as-is
    if not to_matrix:
        return Pipeline(features)
    feature_names = []
    for _, extractor in features:
        feature_names += extractor.FEATS
    tail = [('transform', ToMatrix(features=feature_names)),
            ('norm', MinMaxScaler())]
    return Pipeline(features + tail)
def classify(X_train, y_train, X_test, y_test):
    """Min-max scale the data, fit a Gaussian naive Bayes model on the
    training split and return (y_test, predictions) for the test split."""
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    model = GaussianNB()
    model.fit(X_train, y_train)
    # release the (potentially large) training matrix before predicting
    del X_train
    predictions = model.predict(scaler.transform(X_test))
    return y_test, predictions
def correct_values(values, min_value, max_value): ''' Ensures that values are in given range @param values: 1d numpy array ''' # scale # do nothing if valid values lowest_val = np.min(values) largest_val = np.max(values) lowest_val_valid = lowest_val >= min_value and lowest_val < max_value largest_val_valid = largest_val <= max_value and largest_val > min_value #print("allowed: min_val: ", min_value, " max_val: ", max_value) #print("current: min_val: ", lowest_val, "max_val: ", largest_val) if lowest_val_valid and largest_val_valid: pass else: #print("at least one not valid") # +/-1 to prevent AssertionErrors caused by rounding errors # -> +/-1 introduces new excpetion: "ValueError: Minimum of desired # feature range must be smaller than maximum. Got (84.80001171045868, # 84). -> Therefore used without +-1 and adapted assertions. min_value_for_scaler = min_value # + 1 max_value_for_scaler = max_value # - 1 # re-use max/min values in data if valid, otherwise all functions would # be in same range if lowest_val_valid: #print("lowest valid") min_value_for_scaler = lowest_val if largest_val_valid: #print("largest valid") max_value_for_scaler = largest_val scaler = MinMaxScaler(feature_range=( min_value_for_scaler, max_value_for_scaler)) reshaped_values = values.reshape(-1, 1) # otherwise DeprecationWarning scaler = scaler.fit(reshaped_values) values = scaler.transform(reshaped_values) values = np.reshape(values, len(values)) # original shape # print("afterwards: min_val: ", np.min( # values), " max_val: ", np.max(values)) min_in_scaled = np.min(values) max_in_scaled = np.max(values) # test whether min_value <= min_in_scaled assert min_value - min_in_scaled <= 0.0000001, "current min: " + \ str(min_in_scaled) + "but allowed min is: " + str(min_value) # test wheter max_in_scaled <= max_value assert max_in_scaled - max_value <= 0.000001, "current max: " + str(max_in_scaled) + \ " but allowed max is: " + str(max_value) return values
def xtraintestdata(self, datarray, yarray, dfiletowrite):
    """Split data into train/validation/test (80/20, then 80/20 again),
    dump `yarray` to a .mat file, and return the stacked [X | y] splits
    as (train, test, valid)."""
    x_train, x_test, y_train, y_test = train_test_split(
        datarray, yarray, test_size=0.2, random_state=1)
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.2, random_state=1)

    # NOTE(review): the scaled training block is computed but never used
    # in the returned splits — kept for parity with the original code.
    min_max_scaler = MinMaxScaler()
    x_train_norm = min_max_scaler.fit_transform(x_train)
    _ = np.c_[x_train_norm, y_train]

    sio.savemat(dfiletowrite, mdict={'UCIDat': yarray})

    xy_valid = np.c_[x_val, y_val]
    xy_train = np.c_[x_train, y_train]
    xy_test = np.c_[x_test, y_test]
    return xy_train, xy_test, xy_valid
def get_serialized_pipeline(train):
    """Pipeline that reads serialized features (minus a black list), adds
    kNN-search and TF-IDF features, converts to a matrix and scales it."""
    from src.features import counting_feat, knn_similarity

    # config = get_config()
    features_dump_dir = 'data/claim-rank/feats_dumps'
    black_list = ['polarity', 'subjectivity', 'sent_nrc', 'discourse_rel',
                  'discourse_it', 'in_chunk_last_it', 'in_chunk_last_rel',
                  'in_chunk_first_it', 'in_chunk_first_rel'
                  ]
    read_feature_names = [name for name in listdir(features_dump_dir)
                          if name not in black_list]
    all_feature_names = (read_feature_names +
                         counting_feat.BagOfTfIDFN.FEATS +
                         knn_similarity.TrainSearch.FEATS)
    print(all_feature_names)
    steps = [('read', ReadFeatures(read_feature_names)),
             ("train_search", knn_similarity.TrainSearch(train=train)),
             ('tfidf', counting_feat.BagOfTfIDFN(train)),  # cb
             ('transform', ToMatrix(features=all_feature_names)),
             ('norm', MinMaxScaler())]
    return Pipeline(steps)
def classifier_dyer2012(X_train, y_train, X_test, y_test, time_train=None,
                        time_test=None):
    """Dyer 2012 VNG++ classifier: featurize the traces, min-max scale,
    train multinomial naive Bayes and return (y_test, predictions).

    FIX: removed the dead locals `models1`, `params1` and `labels`
    (built but never used anywhere in the function).
    """
    obj = Dyer2012VNGPlusPlusClassifier()
    X_train, fields = dyer2012_tracestoInstances(obj, X_train, time_train)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)

    # train
    dyer_NB = MultinomialNB()
    dyer_NB.fit(X_train, y_train)
    del X_train

    # test: featurize with the SAME field set, scale with the fitted scaler
    X_test, fields = dyer2012_tracestoInstances(obj, X_test, time_test,
                                                fields)
    X_test = scaler.transform(X_test)
    predictions = dyer_NB.predict(X_test)
    del X_test

    return y_test, predictions
def load_data(n_samples, label_scaling: bool = False, method: str = 'maxmin'):
    """Take in Brian's data and spit out some numpy arrays for the PAL.

    Args:
        n_samples: number of initial samples picked by the strategy.
        label_scaling: min-max scale the targets when True.
        method: initial-design strategy, 'maxmin' or 'kmeans'.

    Raises:
        ValueError: for an unknown `method` (previously this fell through
            and crashed with a NameError on `greedy_indices`).
    """
    df_full_factorial_feat = pd.read_csv(
        os.path.join(DATADIR, 'new_features_full_random.csv'))[FEATURES].values
    a2 = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values
    deltaGMax = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values  # pylint:disable=unused-variable
    gibbs = pd.read_csv(os.path.join(
        DATADIR, 'b1-b21_random_deltaG.csv'))['deltaGmin'].values * (-1)
    gibbs_max = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['deltaGmax'].values
    force_max = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_fit2.csv'))['F_repel_max'].values  # pylint:disable=unused-variable
    rg = pd.read_csv(os.path.join(DATADIR, 'rg_results.csv'))['Rg'].values

    # targets: radius of gyration, -deltaGmin, deltaGmax
    y = np.hstack(
        [rg.reshape(-1, 1), gibbs.reshape(-1, 1), gibbs_max.reshape(-1, 1)])
    assert len(df_full_factorial_feat) == len(a2) == len(gibbs) == len(y)

    feat_scaler = StandardScaler()
    X = feat_scaler.fit_transform(df_full_factorial_feat)
    if label_scaling:
        label_scaler = MinMaxScaler()
        y = label_scaler.fit_transform(y)

    if method == 'maxmin':
        greedy_indices = get_maxmin_samples(X, n_samples)
    elif method == 'kmeans':
        greedy_indices = get_kmeans_samples(X, n_samples)
    else:
        # FIX: fail loudly on an unsupported strategy
        raise ValueError(
            "method must be 'maxmin' or 'kmeans', got {!r}".format(method))
    return X, y, greedy_indices
def prepare_df_for_violinplot(df, feature_cols, class_col, class_indices=None,
                              minmaxscale=True):
    """
    Optionally min-max-scale the feature columns and melt the dataframe
    into the long format expected by seaborn's violinplot.

    :param df: source dataframe
    :param feature_cols: feature column names to keep (and scale)
    :param class_col: column holding the class label (id_var of the melt)
    :param class_indices: optional index subset to restrict the rows to
    :param minmaxscale: scale features into [0, 1] when True
    :return: long-format dataframe with columns [class_col, variable, value]
    """
    if class_indices:
        df = df.loc[list(class_indices)]
    df = df[feature_cols + [class_col]]
    if minmaxscale:
        # FIX: sklearn.preprocessing.data is a private module that was
        # removed in scikit-learn 0.24 — import from the public package.
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        df[feature_cols] = scaler.fit_transform(df[feature_cols])
    prepared_df = pd.melt(df, value_vars=feature_cols, id_vars=class_col)
    return prepared_df
def calculate_scores(data_frame):
    """Derive cycle time, normalised price/cycle/money scores, a combined
    score and a money rank for the filtered listings."""
    data_frame = filter_data(data_frame)
    scaler = MinMaxScaler()

    # cycle time estimated from the distance to UCD
    data_frame['cycle_time'] = data_frame['distance_to_ucd'] * 6
    # lower price / cycle time => higher score, hence 1 - scaled value
    data_frame['price_score'] = 1 - scaler.fit_transform(
        data_frame[['price']])
    data_frame['cycle_time_score'] = 1 - scaler.fit_transform(
        data_frame[['cycle_time']])
    # monthly cost: rent plus commute time priced per hour over 22 days
    data_frame['money'] = (
        data_frame['price'] +
        data_frame['cycle_time'] * 22 * TIME_PRICE_PER_HOUR / 60)
    data_frame['money_score'] = 1 - scaler.fit_transform(
        data_frame[['money']])
    combined = data_frame['price_score'] + data_frame['cycle_time_score']
    data_frame['score'] = combined
    data_frame['score'] = scaler.fit_transform(data_frame[['score']])
    data_frame['money_rank'] = (
        data_frame['money'].rank(ascending=False) / (len(data_frame)))

    pandas.options.display.max_colwidth = 200
    return data_frame