def check_transformer_pickle(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    set_fast_parameters(transformer)
    # fit
    if name in CROSS_DECOMPOSITION:
        random_state = np.random.RandomState(seed=12345)
        y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y
    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)
    pickled_transformer = pickle.dumps(transformer)
    unpickled_transformer = pickle.loads(pickled_transformer)
    pickled_X_pred = unpickled_transformer.transform(X)
    assert_array_almost_equal(pickled_X_pred, X_pred)
def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]
    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names
        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)
        # fit
        classifier.fit(X, y_)
        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" % (classifier, classes, classifier.classes_))
def loadData(path="../data/", k=5, log='add', pca_n=0, SEED=34):
    from pandas import DataFrame, read_csv
    from numpy import log as ln
    from sklearn.cross_validation import KFold
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import StandardScaler
    train = read_csv(path + "train.csv")
    test = read_csv(path + "test.csv")
    id = test.id
    target = train.target
    encoder = LabelEncoder()
    target_nnet = encoder.fit_transform(target).astype('int32')
    feat_names = [x for x in train.columns if x.startswith('feat')]
    train = train[feat_names].astype(float)
    test = test[feat_names]
    if log == 'add':
        for v in train.columns:
            train[v + '_log'] = ln(train[v] + 1)
            test[v + '_log'] = ln(test[v] + 1)
    elif log == 'replace':
        for v in train.columns:
            train[v] = ln(train[v] + 1)
            test[v] = ln(test[v] + 1)
    if pca_n > 0:
        from sklearn.decomposition import PCA
        pca = PCA(pca_n)
        train = pca.fit_transform(train)
        test = pca.transform(test)
    scaler = StandardScaler()
    scaler.fit(train)
    train = DataFrame(scaler.transform(train), columns=['feat_' + str(x) for x in range(train.shape[1])])
    test = DataFrame(scaler.transform(test), columns=['feat_' + str(x) for x in range(train.shape[1])])
    cv = KFold(len(train), n_folds=k, shuffle=True, random_state=SEED)
    return train, test, target, target_nnet, id, cv, encoder
def transformTestData(self, train_data, test_data):
    # Select the right features for both training and testing data
    X_train, y_train = self.__selectRelevantFeatures(train_data)
    X_test, y_test = self.__selectRelevantFeatures(test_data)
    # Transform categorical variables into integer labels
    martial_le = LabelEncoder()
    occupation_le = LabelEncoder()
    relationship_le = LabelEncoder()
    race_le = LabelEncoder()
    sex_le = LabelEncoder()
    transformers = [martial_le, occupation_le, relationship_le, race_le, sex_le]
    for i in range(len(transformers)):
        X_train[:, i] = transformers[i].fit_transform(X_train[:, i])
        X_test[:, i] = transformers[i].transform(X_test[:, i])
    # Dummy code categorical variables
    dummy_code = OneHotEncoder(categorical_features=range(5))
    X_train = dummy_code.fit_transform(X_train).toarray()
    X_test = dummy_code.transform(X_test).toarray()
    # Normalize all features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    # Encode y
    class_le = LabelEncoder()
    y_train = class_le.fit_transform(y_train)
    y_test = class_le.transform(y_test)
    # print class_le.transform(["<=50K", ">50K"])
    return X_train, X_test, y_train, y_test
def clustering_approach(self):
    '''
    Cluster user data using various clustering algos
    IN: self.df_full and self.labels
    OUT: results to stdout
    '''
    print 'Fitting clustering model'
    X = self.df_full.values
    y = self.labels
    # scale data
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    # KMeans
    km_clf = KMeans(n_clusters=2, n_jobs=6)
    km_clf.fit(X)
    # swap labels as super-users are in cluster 0 (messy!!)
    temp = y.apply(lambda x: 0 if x == 1 else 1)
    print '\nKMeans clustering: '
    self.analyse_preds(temp, km_clf.labels_)
    # Agglomerative clustering
    print '\nAgglomerative clustering approach: '
    ac_clf = AgglomerativeClustering()
    ac_labels = ac_clf.fit_predict(X)
    self.analyse_preds(y, ac_labels)
    return None
def check_clustering(name, Alg):
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples, n_features = X.shape
    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)
    # fit
    alg.fit(X)
    # with lists
    alg.fit(X.tolist())
    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)
    # fit another time with ``fit_predict`` and compare results
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)
def check_transformer_general(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    _check_transformer(name, Transformer, X, y)
    _check_transformer(name, Transformer, X.tolist(), y.tolist())
def cross_valid(data, classifier, x_cols, y_col, **kwargs):
    # Do train-test split for cross-validation
    size = len(data)
    kf = train_test_split(size)
    y_pred = np.zeros(size)
    y_pred_prob = np.zeros(size)
    y = data[y_col].as_matrix().astype(np.float)
    totaltime_train = 0
    totaltime_test = 0
    for train_index, test_index in kf:
        # Fill in missing values
        df = data.copy()
        df = fill_missing_median(df, train_index)
        # Transform and normalize
        X = df[x_cols].as_matrix().astype(np.float)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        # Build classifier and yield predictions
        y_pred[test_index], y_pred_prob[test_index], train_time, test_time = model(X, y, train_index, test_index, classifier, **kwargs)
        totaltime_train += train_time
        totaltime_test += test_time
    # average the accumulated totals over the folds
    avgtime_train = totaltime_train / len(kf)
    avgtime_test = totaltime_test / len(kf)
    return y, y_pred, y_pred_prob, avgtime_train, avgtime_test
def buildTreeRegressor(predictorColumns, structurestable='structures.csv', targetcolumn='c_a', md=None):
    """
    Build a random forest-regressor model to predict some structure feature from compositional data.

    Will return the model trained on all data, a mean_absolute_error score,
    and a table of true vs. predicted values.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    s = StandardScaler()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df[targetcolumn].values
    rfr = RandomForestRegressor(max_depth=md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfr.fit(X_train, y_train)
    y_predict = rfr.predict(X_test)
    t = pd.DataFrame({'True': y_test, 'Predicted': y_predict})
    rfr.fit(X, y)
    return rfr, t, round(acc, 2)
def buildTreeClassifier(predictorColumns, structurestable='structures.csv', targetcolumn='pointGroup', md=None):
    """
    Build a random forest-classifier model to predict some structure feature from compositional data.

    Will return the model trained on all data, a confusion matrix calculated on a held-out split,
    and an average accuracy score. Also returns a label encoder object.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    s = StandardScaler()
    le = LabelEncoder()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = le.fit_transform(df[targetcolumn].values)
    rfc = RandomForestClassifier(max_depth=md)
    acc = mean(cross_val_score(rfc, X, y))
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfc.fit(X_train, y_train)
    y_predict = rfc.predict(X_test)
    cm = confusion_matrix(y_test, y_predict)
    cm = pd.DataFrame(cm, columns=le.classes_, index=le.classes_)
    rfc.fit(X, y)
    return rfc, cm, round(acc, 2), le
def buildCoordinationTreeRegressor(predictorColumns, element, coordinationDir='coordination/', md=None):
    """
    Build a coordination predictor for a given element from compositional structure data
    of structures containing that element.

    Will return a model trained on all data, a mean_absolute_error score,
    and a table of true vs. predicted values.
    """
    try:
        df = pd.read_csv(coordinationDir + element + '.csv')
    except Exception:
        print 'No data for ' + element
        return None, None, None
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    if(len(df) < 4):
        print 'Not enough data for ' + element
        return None, None, None
    s = StandardScaler()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df['avgCoordination'].values
    rfr = RandomForestRegressor(max_depth=md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfr.fit(X_train, y_train)
    y_predict = rfr.predict(X_test)
    t = pd.DataFrame({'True': y_test, 'Predicted': y_predict})
    rfr.fit(X, y)
    return rfr, t, round(acc, 2)
def main():
    t0 = time.time()  # start time
    # output files path
    TRAINX_OUTPUT = "../../New_Features/train_x_processed.csv"
    TEST_X_OUTPUT = "../../New_Features/test__x_processed.csv"
    # input files path
    TRAIN_FILE_X1 = "../../ML_final_project/sample_train_x.csv"
    TRAIN_FILE_X2 = "../../ML_final_project/log_train.csv"
    TEST__FILE_X1 = "../../ML_final_project/sample_test_x.csv"
    TEST__FILE_X2 = "../../ML_final_project/log_test.csv"
    # load files
    TRAIN_DATA_X1 = np.loadtxt(TRAIN_FILE_X1, delimiter=',', skiprows=1, usecols=(range(1, 18)))
    TEST__DATA_X1 = np.loadtxt(TEST__FILE_X1, delimiter=',', skiprows=1, usecols=(range(1, 18)))
    TRAIN_DATA_X2 = logFileTimeCount(np.loadtxt(TRAIN_FILE_X2, delimiter=',', skiprows=1, dtype=object))
    TEST__DATA_X2 = logFileTimeCount(np.loadtxt(TEST__FILE_X2, delimiter=',', skiprows=1, dtype=object))
    # combine files
    TRAIN_DATA_X0 = np.column_stack((TRAIN_DATA_X1, TRAIN_DATA_X2))
    TEST__DATA_X0 = np.column_stack((TEST__DATA_X1, TEST__DATA_X2))
    # data preprocessing
    scaler = StandardScaler()
    TRAIN_DATA_X = scaler.fit_transform(TRAIN_DATA_X0)
    TEST__DATA_X = scaler.transform(TEST__DATA_X0)
    # output processed files
    outputXFile(TRAINX_OUTPUT, TRAIN_DATA_X)
    outputXFile(TEST_X_OUTPUT, TEST__DATA_X)
    t1 = time.time()  # end time
    print "...This task costs " + str(t1 - t0) + " second."
def knn(x_train, y_train, x_valid):
    x_train = np.log(x_train + 1)
    x_valid = np.log(x_valid + 1)
    where_are_nan = np.isnan(x_train)
    where_are_inf = np.isinf(x_train)
    x_train[where_are_nan] = 0
    x_train[where_are_inf] = 0
    where_are_nan = np.isnan(x_valid)
    where_are_inf = np.isinf(x_valid)
    x_valid[where_are_nan] = 0
    x_valid[where_are_inf] = 0
    scale = StandardScaler()
    scale.fit(x_train)
    x_train = scale.transform(x_train)
    x_valid = scale.transform(x_valid)
    # pca = PCA(n_components=10)
    # pca.fit(x_train)
    # x_train = pca.transform(x_train)
    # x_valid = pca.transform(x_valid)
    kneighbors = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    knn_train, knn_test = stacking(kneighbors, x_train, y_train, x_valid, "knn")
    return knn_train, knn_test, "knn"
def test_transformers_data_not_an_array():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    for name, Transformer in transformers:
        # XXX: some transformers are transforming the input
        # data. This is a bug that we'll fix later. Right now we copy
        # the data each time
        this_X = NotAnArray(X.copy())
        this_y = NotAnArray(np.asarray(y))
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        # And these won't accept multivariate output
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            continue
        yield check_transformer, name, Transformer, this_X, this_y
def load_train_data(path):
    print("Loading Train Data")
    df = pd.read_csv(path)
    # Remove line below to run locally - Be careful you need more than 8GB RAM
    rows = np.random.choice(df.index.values, 40000)
    df = df.ix[rows]
    # df = df.sample(n=40000)
    # df = df.loc[df.index]
    labels = df.target
    df = df.drop('target', 1)
    df = df.drop('ID', 1)
    # Junk cols - Some feature engineering needed here
    df = df.fillna(-1)
    X = df.values.copy()
    np.random.shuffle(X)
    X = X.astype(np.float32)
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y, encoder, scaler
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15, n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
def linregress(X_train, X_test, y_train, y_test):
    coef = []
    for col in X_train.columns.tolist():
        X = StandardScaler().fit_transform(X_train[col])
        lr = LinearRegression()
        lr.fit(X.reshape(-1, 1), y_train)
        coef.append([col, lr.coef_])
    coef = sorted(coef, key=lambda x: x[1])[::-1]
    nos = [x[1] for x in coef]
    labs = [x[0] for x in coef]
    for lab in labs:
        if lab == 'doubles':
            labs[labs.index(lab)] = '2B'
        elif lab == 'triples':
            labs[labs.index(lab)] = '3B'
        elif lab == 'Intercept':
            idx = labs.index('Intercept')
            labs.pop(idx)
            nos.pop(idx)
    labs = [lab.upper() for lab in labs]
    x = range(len(nos))
    plt.plot(x, nos, lw=2, c='b')
    plt.xticks(x, labs)
    plt.title('Linear Regression Coefficients (Win Percentage)')
    plt.savefig('images/coefficients.png')
    plt.show()
    print labs
def monary_load(start=0, stop=-1, find_args={}, species_to_retrieve=[]):
    if species_to_retrieve == []:
        species_to_retrieve = species
    else:
        species_to_retrieve = [s for s in species_to_retrieve if s in species]
    query = {}
    for s in species_to_retrieve:
        query[s] = {"$gt": 0}
    find_args["$or"] = [{k: query[k]} for k in query.keys()]
    with Monary("127.0.0.1") as monary:
        out = monary.query(
            "creeval",
            collection,
            find_args,
            num_metadata + cat_metadata + species_to_retrieve,
            ["float32"] * (len(num_metadata) + len(cat_metadata) + len(species_to_retrieve)),
            limit=(stop - start),
            offset=start)
    for i, col in enumerate(out[0:len(num_metadata + cat_metadata)]):
        out[i] = np.ma.filled(col, np.ma.mean(col))
        # if any(np.isnan(col)):
        #     print col
    out = np.ma.row_stack(out).T
    X = out[:, 0:len(num_metadata + cat_metadata)]
    y = out[:, len(num_metadata + cat_metadata):]
    y = (y > 0).astype(int)
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    pickle.dump(scaler, open(collection + "_scaler.pkl", "wb"))
    y = np.asarray(y)
    return DenseDesignMatrix(X=X, y=y)
class PCATransform(BaseEstimator, TransformerMixin):
    """
    PCA with an argument that allows the user to skip the transform altogether.
    """
    def __init__(self, n_components=.1, skip=False, whiten=False, standard_scalar=True):
        print 'PCA!'
        self.n_components = n_components
        self.skip = skip
        self.whiten = whiten
        self.standard_scalar = standard_scalar

    def fit(self, X, y=None):
        if not self.skip:
            if self.standard_scalar:
                self.std_scalar = StandardScaler().fit(X)
                X = self.std_scalar.transform(X)
            self.pca = PCA(n_components=self.n_components, whiten=self.whiten).fit(X)
        return self

    def transform(self, X, y=None):
        if not self.skip:
            if self.standard_scalar:
                X = self.std_scalar.transform(X)
            return self.pca.transform(X)
        return X
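# A minimal usage sketch for the PCATransform class above (an assumption-laden
# illustration: it presumes the class is in scope together with scikit-learn and
# NumPy; the toy data, n_components=5, and the LogisticRegression step are
# illustrative choices, not part of the original code).
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 20)               # toy data: 100 samples, 20 features
y_demo = (X_demo[:, 0] > 0).astype(int)   # toy binary target

# skip=True would pass the data through unchanged; skip=False scales then applies PCA
pipe = Pipeline([
    ('pca', PCATransform(n_components=5, skip=False)),
    ('clf', LogisticRegression()),
])
pipe.fit(X_demo, y_demo)
print(pipe.score(X_demo, y_demo))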
def main(trainFile, testFile, outputFile, mode, classifier):
    """
    input:
        1. trainFile: the training data features file
        2. testFile: the test data file
        3. outputFile: the file where the output of the test data has to be written
        4. classifier: the classifier to be used
    """
    # scale the input data
    scaler = StandardScaler()
    trainingData = getData(trainFile)
    trainX = trainingData[0]
    trainY = trainingData[1]
    trainX = scaler.fit_transform(trainX)
    testX = []
    testY = []
    # train the classifier
    clf = trainClassifier(trainX, trainY, classifier, mode)
    # if test mode, get test data and predict the output classes
    if mode == 1:
        testData = getData(testFile)
        testX = testData[0]
        testY = testData[1]
        testX = scaler.transform(testX)
        actY = test(testX, clf)
        testY = testY.reshape(len(testY), 1)
        # write the predicted class probabilities
        output = np.concatenate((testY, actY), axis=1)
        np.savetxt(outputFile, output, fmt='%s', delimiter=',')
def process(discrete, cont):
    # Create discrete and continuous data matrices
    discrete_X = np.array(discrete)
    cont_X = np.array(cont)
    # Impute discrete values
    imp = Imputer(strategy='most_frequent')
    discrete_X = imp.fit_transform(discrete_X)
    # Impute continuous values
    imp_c = Imputer(strategy='mean')
    cont_X = imp_c.fit_transform(cont_X)
    # Discrete basis representation
    enc = OneHotEncoder()
    enc.fit(discrete_X)
    discrete_X = enc.transform(discrete_X).toarray()
    # Continuous scaling
    scaler = StandardScaler()
    scaler.fit(cont_X)
    cont_X = scaler.transform(cont_X)
    # Merge to one array
    X = np.concatenate((discrete_X, cont_X), axis=1)
    return X
class Classifier(BaseEstimator):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.clf = None

    def fit(self, X, y):
        X = self.scaler.fit_transform(X.astype(np.float32))
        y = self.label_encoder.fit_transform(y).astype(np.int32)
        dtrain = xgb.DMatrix(X, label=y.astype(np.float32))
        param = {'objective': 'multi:softprob', 'eval_metric': 'mlogloss'}
        param['nthread'] = 4
        param['num_class'] = 9
        param['colsample_bytree'] = 0.55
        param['subsample'] = 0.85
        param['gamma'] = 0.95
        param['min_child_weight'] = 3.0
        param['eta'] = 0.05
        param['max_depth'] = 12
        num_round = 400  # to be faster ??
        # num_round = 820
        self.clf = xgb.train(param, dtrain, num_round)

    def predict(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        label_index_array = np.argmax(self.clf.predict(dtest), axis=1)
        return self.label_encoder.inverse_transform(label_index_array)

    def predict_proba(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
def normalize(training_data, test_data):
    scaler = StandardScaler()
    values = scaler.fit_transform(training_data)
    training_data = pd.DataFrame(values, columns=training_data.columns, index=training_data.index)
    values = scaler.transform(test_data)
    test_data = pd.DataFrame(values, columns=test_data.columns, index=test_data.index)
    return training_data, test_data
def load_data_csv_advanced(datafile):
    """
    Loads data from given CSV file. The first line in the given CSV file is expected
    to be the names of the columns.

    :param datafile: path of the file
    :return: a NumPy array containing a data point in each row
    """
    # File format for CSV file. For example, setting _COLUMN_X to 'x' means that x coordinates
    # of geographical location will be at the column named 'x' in the CSV file.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'
    data = pd.read_csv(datafile)
    # Normalize
    scaler = StandardScaler()
    scaler.fit(data[[_COLUMN_X, _COLUMN_Y]])
    data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]])
    # Get feature vector names by removing "x" and "y"
    feature_vector_names = data.columns.difference([_COLUMN_X, _COLUMN_Y])
    data_coords = data[[_COLUMN_X, _COLUMN_Y]].values
    result = {"coordinates": data_coords}
    for feature in feature_vector_names:
        data_words = [[e.strip() for e in venue_data.split(",")] for venue_data in data[feature].values.flatten().tolist()]
        result[feature] = data_words
    return sparsify_data(result, None, None), scaler  # None for both params since SVD is not used
def lassoRegression(X, y):
    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Lasso Regression")
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)
    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    lassoRegression = Lasso(alpha=1e-7)
    lassoRegression.fit(scaled_Xp, y)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0, 2, 0.01)
    dummyX = dummyX.reshape((dummyX.shape[0], 1))
    dummyXp = polynomialFeatures.fit_transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = lassoRegression.predict(scaled_dummyXp)
    outputFILE = 'plot-lassoRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h=6.0, w=10.0)
    ax.axis([0, 2, 0, 15])
    ax.scatter(X, y, color="black", s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(filename=outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return(None)
def prepare_features(data, enc=None, scaler=None):
    '''
    One-hot encode all boolean/string (categorical) features,
    and shift/scale integer/float features
    '''
    # X needs to contain only non-negative integers
    bfs = data['bfeatures'] + 1
    sfs = data['sfeatures'] + 1
    # Shift/scale integer and float features to have mean=0, std=1
    ifs = data['ifeatures']
    ffs = data['ffeatures']
    x2 = np.hstack((ifs, ffs))
    if scaler is None:
        scaler = StandardScaler()
        x2 = scaler.fit_transform(x2)
        print "Training features have mean: %s" % scaler.mean_
        print "and standard deviation: %s" % scaler.std_
    else:
        x2 = scaler.transform(x2, copy=False)
    # one-hot encode categorical features
    X = np.hstack((bfs, sfs, x2))
    categorical = np.arange(bfs.shape[1] + sfs.shape[1])
    if enc is None:
        enc = OneHotEncoder(n_values='auto', categorical_features=categorical)
        X = enc.fit_transform(X)
        print "One-hot encoded features have dimension %d" % X.shape[1]
    else:
        X = enc.transform(X)
    return X, enc, scaler
def load_data_csv(datafile):
    """
    Loads data from given CSV file. The first line in the given CSV file is expected
    to be the names of the columns.

    :param datafile: path of the file
    :return: a NumPy array containing a data point in each row
    """
    # File format for CSV file. For example, setting _COLUMN_X to 'x' means that x coordinates
    # of geographical location will be at the column named 'x' in the CSV file.
    # This will be useful later when we start adding more features.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'
    _COLUMN_W = 'color'
    data = pd.read_csv(datafile)
    # Normalize
    scaler = StandardScaler()
    scaler.fit(data[[_COLUMN_X, _COLUMN_Y]])
    data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]])
    data_coords = data[[_COLUMN_X, _COLUMN_Y]].values
    data_words = [[e] for e in data[[_COLUMN_W]].values.flatten().tolist()]
    data = {"coordinates": data_coords, "words": data_words}
    return sparsify_data(data, None, None), scaler  # None for both params since SVD is not used
def run_model(model, model_name, X, Y, X_val):
    new_values = [[x] for x in range(len(X))]
    X = numpy.append(X, new_values, 1)
    from sklearn.preprocessing import StandardScaler
    # I have a suspicion that the classifier might work better without the scaler
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    max_time_val = X[-1][-1] * 2 - X[-2][-1]
    Y = make_black_maps_class(Y)
    # Load validation data
    model.fit(X, Y)
    new_values = [[max_time_val] for x in range(len(X_val))]
    X_val = numpy.append(X_val, new_values, 1)
    # Now predict validation output
    Y_pred = model.predict(X_val)
    # Crop impossible values
    Y_pred[Y_pred < 0] = 0
    Y_pred[Y_pred > 600] = 600
    savetxt('final_pred_y{0}.csv'.format(model_name), Y_pred, delimiter=',')
    black_map_count = 0
    for y in Y_pred:
        if y == 600:
            black_map_count += 1
    print black_map_count, model_name
    sys.stdout.flush()
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)
    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
def train_and_test(train_books, test_books, train, scale=True):
    X_train, y_train, cands_train, features = get_pair_data(train_books, True)
    X_test, y_test, cands_test, features = get_pair_data(test_books)
    scaler = None
    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    print sum(y_train) * 0.1 / len(y_train)
    print 'Start training'
    print X_train.shape
    clf = train(X_train, y_train)
    print 'Done training'
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    '''
    # print performance for training books
    print "--------------Traning data-------------"
    train_perf = evaluate_books(clf, train_books, scaler, evaluate_pair)
    # print performance for testing books
    print "\n"
    print "--------------Testing data-------------"
    test_perf = evaluate_books(clf, test_books, scaler, evaluate_pair)
    '''
    print 'Train Non-unique Precision:', precision(y_train_pred, y_train), 'Non-unique Recall:', recall(y_train_pred, y_train)
    print 'Test Non-unique Precision:', precision(y_test_pred, y_test), 'Recall:', recall(y_test_pred, y_test)
    return clf, scaler, X_train, y_train, X_test, y_test
X.columns[feats.get_support()]

############ SEQUENTIAL FEATURE SELECTION #################
# Includes Forward Selection vs Backward selection
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

knn_pipe = Pipeline([('scaler', StandardScaler()),
                     ('knn', KNeighborsClassifier(n_neighbors=4, n_jobs=-1))])

# Forward Selection
selector = SequentialFeatureSelector(knn_pipe, scoring='accuracy',
                                     forward=True, floating=False,
                                     k_features=3, verbose=2,
                                     n_jobs=-1, cv=5)
selector.fit(X=X, y=y)

selector.subsets_
selector.k_feature_idx_
selector.k_feature_names_
def run_sim(n, path, simtitle, REVOLUTION, PROBLEM, seed): POP_SIZE = 100 MAX_GENERATION = 50 MAX_EPISODE = 100 MAX_EVAL = 1000 STOPPING_RULE = 'max_eval' MUTATION_RATE = 0.1 MUTATION_U = 0. MUTATION_ST = 0.2 REF = [1., 1.] MINIMIZE = True VERBOSE = True X_SCALER = StandardScaler() # Set global numpy random_state np.random.seed(seed) theo = PROBLEM.solutions() # Instantiate a population pop = MOPRISM(size=POP_SIZE, problem=PROBLEM, max_generation=MAX_GENERATION, max_episode=MAX_EPISODE, reference=REF, minimize=MINIMIZE, stopping_rule=STOPPING_RULE, max_eval=MAX_EVAL, mutation_rate=MUTATION_RATE, revolution=REVOLUTION, embedded_ea=MOEAD, verbose=VERBOSE, no_improvement_step_tol=3) pop.selection_fun = pop.compute_front pop.mutation_fun = gaussian_mutator pop.crossover_fun = random_crossover # Parametrization params_ea = { 'u': MUTATION_U, 'st': MUTATION_ST, 'trial_method': 'lhs', 'trial_criterion': 'cm' } kernel = CubicKernel tail = LinearTail params_surrogate = \ {'kernel': kernel, 'tail': tail, 'maxp': MAX_EVAL + POP_SIZE, 'eta': 1e-8, } # ===============================Initialization============================ pop.config_surrogate(typ='rbf', params=params_surrogate, n_process=1, X_scaler=X_SCALER, warm_start=True) pop.config_gap_opt(at='least_crowded', radius=0.1, size=POP_SIZE, max_generation=MAX_GENERATION, selection_fun=None, mutation_fun=None, mutation_rate=None, crossover_fun=random_crossover, trial_method='lhs', trial_criterion='cm', u=0., st=0.2) pop.config_sampling(methods='default', sizes='default', rate='default', candidates='default') pop.run(params_ea=params_ea, params_surrogate=params_surrogate, theo=theo) # ============================= Save Results ================================ # # path to save directory = path + simtitle + '/' + str(n) + '/' if not os.path.exists(directory): os.makedirs(directory) pop.render_features(pop.true_front).tofile(directory + 'xs.dat') pop.render_targets(pop.true_front).tofile(directory + 'fs.dat') np.array(pop.hypervol_diff).tofile(directory + 'hv_diff.dat') np.array(pop.hypervol_cov_effect).tofile(directory + 'hv_cov.dat') np.array(pop.hypervol_index).tofile(directory + 'hv_ind.dat') # ================================Visualization============================== # # plot_res(pop=pop, ref=theo, directory=directory) return pop
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    dt_heart = pd.read_csv("./data/heart.csv")
    print(dt_heart.head(5))
    dt_features = dt_heart.drop(["target"], axis=1)
    dt_target = dt_heart["target"]
    dt_features = StandardScaler().fit_transform(dt_features)
    X_train, X_test, y_train, y_test = train_test_split(dt_features, dt_target, test_size=0.3, random_state=42)
    print(X_train.shape, y_train.shape)
    pca = PCA(n_components=3)
    pca.fit(X_train)
    ipca = IncrementalPCA(n_components=3, batch_size=10)
    ipca.fit(X_train)
    # plt.plot(range(len(pca.explained_variance_)), pca.explained_variance_ratio_)
def one_day_window_model(): files = glob.glob('../DATA/A*/A*/*.json') sentimentAnalyzer = SentimentIntensityAnalyzer() with open('tweet_sentiment.csv', 'w+') as sfl: for file in files: with open(file) as fl: lines = fl.readlines() tweets = json.loads(lines[0]) for tweet in tweets: date = time.strftime('%Y/%m/%d', time.localtime(int(tweet['time']))) scores = sentimentAnalyzer.polarity_scores(tweet['text']) sfl.write(date + ',' + str(scores['pos']) + ',' + str(scores['neg']) + ',' + str(scores['neu']) + ',' + str(scores['compound'])) sfl.write('\n') prices = pd.read_csv('../DATA/CHARTS/APPLE1440.csv').values all_tweets = pd.read_csv('tweet_sentiment.csv').values with open('features.csv', 'w+') as fl: for price in prices: current_date = datetime.strptime(price[0], '%Y.%m.%d').date() previous_date = current_date - timedelta(days=1) tweets = all_tweets[all_tweets[:, 0] == previous_date.strftime( '%Y/%m/%d')] if len(tweets) != 0: if float(price[5]) > float(price[2]): label = "1" else: label = "0" for tweet in tweets: fl.write(price[0] + ',' + str(tweet[1]) + ',' + str(tweet[2]) + ',' + str(tweet[3]) + ',' + str(tweet[4]) + ',' + label) fl.write('\n') dataset = pd.read_csv('features.csv') X = dataset.iloc[:, [1, 2, 3, 4]].values y = dataset.iloc[:, 5].values scaler = StandardScaler() X[:, :] = scaler.fit_transform(X[:, :]) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) rf = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=3) rf.fit(X_train, y_train) y_pred = rf.predict(X_test) cm = confusion_matrix(y_test, y_pred) print("Random Forest Accuracy: " + str((cm[0, 0] + cm[1, 1]) / (cm[0, 0] + cm[1, 1] + cm[1, 0] + cm[0, 1]))) svc = SVC(kernel='poly', random_state=0) svc.fit(X_train, y_train) y_pred = svc.predict(X_test) cm = confusion_matrix(y_test, y_pred) print("SVM Accuracy: " + str((cm[0, 0] + cm[1, 1]) / (cm[0, 0] + cm[1, 1] + cm[1, 0] + cm[0, 1]))) mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100), random_state=10) mlp.fit(X_train, y_train) y_pred = mlp.predict(X_test) cm = confusion_matrix(y_test, y_pred) print("MLP Accuracy: " + str((cm[0, 0] + cm[1, 1]) / (cm[0, 0] + cm[1, 1] + cm[1, 0] + cm[0, 1])))
X_train_norm = mms.fit_transform(X_train)
print(X_train_norm)
X_test_norm = mms.transform(X_test)
print(X_test)

# Standardization is essential for many linear models
print('standardized:', (ex - ex.mean()) / ex.std())

from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

# Selecting meaningful features
# Normalization (min-max scaling)
print('normalized:', (ex - ex.min()) / (ex.max() - ex.min()))

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', C=1.0)
lr.fit(X_train_std, y_train)
import DatabaseConnection as dc

# Connect to database and get the data
home_data = dc.download_housing_data(dc.connect())

# Filter out outliers in the dataset based on the outlier graph from preprocessing,
# indicating that the majority of the data is in the lower end of the price range.
home_data = home_data[home_data['price'] < 1250000]

# Visualize the different groupings of houses in the dataset using kmeans cluster analysis.
cata_home_data = home_data.copy()  # copy of the dataset to maintain integrity
price_data = cata_home_data['price']
cata_home_data.drop(['price'], axis=1, inplace=True)

# Standardize the data.
scaler = StandardScaler()
scaler.fit(cata_home_data)
scaled_data = scaler.transform(cata_home_data)

# Determine the optimal number of clusters
def show_elbow_plot():
    """
    Displays the elbow graph used to choose the optimal number of clusters
    for the final cluster graph.
    """
    test_cluster_max = 15
    kmeans_tests = [KMeans(n_clusters=i) for i in range(1, test_cluster_max)]
    score = [kmeans_tests[i].fit(scaled_data).score(scaled_data) for i in range(len(kmeans_tests))]
    # Plot the curve
    elbow_plot = plt.plot(range(1, test_cluster_max), score)
#%%
# let's do PCA
sns.set_style("darkgrid")
colors = ['#e6194b', '#0082c8', '#d2f53c', '#3cb44b', '#f032e6',
          '#911eb4', '#46f0f0', '#f58231', '#008080', '#ffe119']

scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)

# visualize all attributes in the data set
num_components = 2
pca = PCA(n_components=num_components)
pca.fit(X_train)
print(pca.explained_variance_ratio_)
print('var', sum(pca.explained_variance_ratio_))
total_explained_variance = sum(pca.explained_variance_ratio_)
train_pca = pca.transform(X_train)

# make this more pretty later
records, attributes = np.shape(train_pca)
train_pca_ones = np.ones((records, attributes + 1))
train_pca_ones[:, 1:] = train_pca
def validateRF(): """ run KFOLD method for regression """ #defining directories dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged" dir_out = "/lustre/fs0/home/mtadesse/merraRFValidation" surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef" #cd to the lagged predictors directory os.chdir(dir_in) x = 119 y = 120 #empty dataframe for model validation df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse']) #looping through for tg in range(x, y): os.chdir(dir_in) #filter only .csv files tgNames = [] for file in glob.glob("*.csv"): tgNames.append(file) tg_name = sorted(tgNames)[tg] print(tg_name) ########################################## #check if this tg is already taken care of ########################################## os.chdir(dir_out) if os.path.isfile(tg_name): print("this tide gauge is already taken care of") return "file already analyzed!" os.chdir(dir_in) #load predictor pred = pd.read_csv(tg_name) pred.drop('Unnamed: 0', axis=1, inplace=True) #add squared and cubed wind terms (as in WPI model) pickTerms = lambda x: x.startswith('wnd') wndTerms = pred.columns[list(map(pickTerms, pred.columns))] wnd_sqr = pred[wndTerms]**2 wnd_cbd = pred[wndTerms]**3 pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1) #standardize predictor data dat = pred.iloc[:, 1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1) #load surge data os.chdir(surge_path) surge = pd.read_csv(tg_name) surge.drop('Unnamed: 0', axis=1, inplace=True) #remove duplicated surge rows surge.drop(surge[surge['ymd'].duplicated()].index, axis=0, inplace=True) surge.reset_index(inplace=True) surge.drop('index', axis=1, inplace=True) #adjust surge time format to match that of pred time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d')) surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns=['date']) time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis=1) #merge predictors and surge to find common time frame pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2], on='date', how='right') pred_surge.sort_values(by='date', inplace=True) #find rows that have nans and remove them row_nan = pred_surge[pred_surge.isna().any(axis=1)] pred_surge.drop(row_nan.index, axis=0, inplace=True) pred_surge.reset_index(inplace=True) pred_surge.drop('index', axis=1, inplace=True) #in case pred and surge don't overlap if pred_surge.shape[0] == 0: print('-' * 80) print('Predictors and Surge don' 't overlap') print('-' * 80) continue pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \ pred_surge['date'])), \ columns = ['date']) #prepare data for training/testing X = pred_surge.iloc[:, 1:-1] y = pd.DataFrame(pred_surge['surge']) y = y.reset_index() y.drop(['index'], axis=1, inplace=True) #apply PCA pca = PCA(.95) pca.fit(X) X_pca = pca.transform(X) #apply 10 fold cross validation kf = KFold(n_splits=10, random_state=29) metric_corr = [] metric_rmse = [] #combo = pd.DataFrame(columns = ['pred', 'obs']) for train_index, test_index in kf.split(X): X_train, X_test = X_pca[train_index], X_pca[test_index] y_train, y_test = y['surge'][train_index], y['surge'][test_index] #train regression model rf= RandomForestRegressor(n_estimators = 50, random_state = 101, \ min_samples_leaf = 1) rf.fit(X_train, y_train) #predictions predictions = rf.predict(X_test) # 
pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \ # pd.DataFrame(np.array(y_test))], \ # axis = 1) # pred_obs.columns = ['pred', 'obs'] # combo = pd.concat([combo, pred_obs], axis = 0) #evaluation matrix - check p value if stats.pearsonr(y_test, predictions)[1] >= 0.05: print("insignificant correlation!") continue else: print(stats.pearsonr(y_test, predictions)) metric_corr.append(stats.pearsonr(y_test, predictions)[0]) print(np.sqrt(metrics.mean_squared_error(y_test, predictions))) print() metric_rmse.append( np.sqrt(metrics.mean_squared_error(y_test, predictions))) #number of years used to train/test model num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\ pred_surge['date'][0]).days/365 longitude = surge['lon'][0] latitude = surge['lat'][0] num_pc = X_pca.shape[1] #number of principal components corr = np.mean(metric_corr) rmse = np.mean(metric_rmse) print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' - avg_rmse (m) = ', \ np.mean(metric_rmse), '\n') #original size and pca size of matrix added new_df = pd.DataFrame( [tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T new_df.columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse'] df = pd.concat([df, new_df], axis=0) #save df as cs - in case of interruption os.chdir(dir_out) df.to_csv(tg_name)
def train_and_predict_dragons(t, y_unscaled, x, targeted_regularization=True, output_dir='', knob_loss=dragonnet_loss_binarycross, ratio=1., dragon='', val_split=0.2, batch_size=64): verbose = 0 y_scaler = StandardScaler().fit(y_unscaled) y = y_scaler.transform(y_unscaled) train_outputs = [] test_outputs = [] if dragon == 'tarnet': dragonnet = make_tarnet(x.shape[1], 0.01) elif dragon == 'dragonnet': print("I am here making dragonnet") dragonnet = make_dragonnet(x.shape[1], 0.01) metrics = [ regression_loss, binary_classification_loss, treatment_accuracy, track_epsilon ] if targeted_regularization: loss = make_tarreg_loss(ratio=ratio, dragonnet_loss=knob_loss) else: loss = knob_loss # for reporducing the IHDP experimemt i = 0 tf.random.set_seed(i) np.random.seed(i) # print() train_index, test_index = train_test_split(np.arange(x.shape[0]), random_state=1) test_index = train_index x_train, x_test = x[train_index], x[test_index] y_train, y_test = y[train_index], y[test_index] t_train, t_test = t[train_index], t[test_index] yt_train = np.concatenate([y_train, t_train], 1) import time start_time = time.time() dragonnet.compile(optimizer=Adam(lr=1e-3), loss=loss, metrics=metrics) adam_callbacks = [ TerminateOnNaN(), EarlyStopping(monitor='val_loss', patience=2, min_delta=0.), ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=verbose, mode='auto', min_delta=1e-8, cooldown=0, min_lr=0) ] dragonnet.fit(x_train, yt_train, callbacks=adam_callbacks, validation_split=val_split, epochs=100, batch_size=batch_size, verbose=verbose) sgd_callbacks = [ TerminateOnNaN(), EarlyStopping(monitor='val_loss', patience=40, min_delta=0.), ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=verbose, mode='auto', min_delta=0., cooldown=0, min_lr=0) ] sgd_lr = 1e-5 momentum = 0.9 dragonnet.compile(optimizer=SGD(lr=sgd_lr, momentum=momentum, nesterov=True), loss=loss, metrics=metrics) dragonnet.fit(x_train, yt_train, callbacks=sgd_callbacks, validation_split=val_split, epochs=300, batch_size=batch_size, verbose=verbose) elapsed_time = time.time() - start_time print("***************************** elapsed_time is: ", elapsed_time) yt_hat_test = dragonnet.predict(x_test) yt_hat_train = dragonnet.predict(x_train) test_outputs += [ _split_output(yt_hat_test, t_test, y_test, y_scaler, x_test, test_index) ] train_outputs += [ _split_output(yt_hat_train, t_train, y_train, y_scaler, x_train, train_index) ] K.clear_session() return test_outputs, train_outputs
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
#### Initialize Dataset ####
data_df = pd.read_csv(settings['dataset_file'])
data = {
    'outcome_name': data_df.columns[0],
    'variable_names': data_df.columns[1:].tolist(),
    'X': data_df.iloc[:, 1:],
    'y': data_df.iloc[:, 0],
    'scaler': None,
}

if settings['normalize_data']:
    from sklearn.preprocessing import StandardScaler
    data['scaler'] = StandardScaler(copy=True, with_mean=True, with_std=True)
    data['X_train'] = pd.DataFrame(data['scaler'].fit_transform(data['X'], data['y']), columns=data['X'].columns)
else:
    data['X_train'] = data['X']

#### Initialize Actionset ####
default_bounds = (0.1, 99.9, 'percentile')
custom_bounds = None
immutable_variables = []

if settings['data_name'] == 'credit':
    immutable_names = ['Female', 'Single', 'Married']
    immutable_names += list(filter(lambda x: 'Age' in x or 'Overdue' in x, data['variable_names']))
def train_and_predict_ned(t, y_unscaled, x, targeted_regularization=True, output_dir='', knob_loss=dragonnet_loss_binarycross, ratio=1., dragon='', val_split=0.2, batch_size=64): verbose = 0 y_scaler = StandardScaler().fit(y_unscaled) y = y_scaler.transform(y_unscaled) train_outputs = [] test_outputs = [] nednet = make_ned(x.shape[1], 0.01) metrics_ned = [ned_loss] metrics_cut = [regression_loss] # for reproducing the ihdp result i = 0 tf.random.set_random_seed(i) np.random.seed(i) # change the test_size to get in sample and out sample estimates test_size = 0. train_index, test_index = train_test_split(np.arange(x.shape[0]), test_size=test_size) if test_size == 0: test_index = train_index x_train, x_test = x[train_index], x[test_index] y_train, y_test = y[train_index], y[test_index] t_train, t_test = t[train_index], t[test_index] yt_train = np.concatenate([y_train, t_train], 1) nednet.compile(optimizer=Adam(lr=1e-3), loss=ned_loss, metrics=metrics_ned) adam_callbacks = [ TerminateOnNaN(), EarlyStopping(monitor='val_loss', patience=2, min_delta=0.), ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=verbose, mode='auto', min_delta=1e-8, cooldown=0, min_lr=0) ] nednet.fit(x_train, yt_train, callbacks=adam_callbacks, validation_split=val_split, epochs=100, batch_size=batch_size, verbose=verbose) sgd_callbacks = [ TerminateOnNaN(), EarlyStopping(monitor='val_loss', patience=40, min_delta=0.), ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=verbose, mode='auto', min_delta=0., cooldown=0, min_lr=0) ] sgd_lr = 1e-5 momentum = 0.9 nednet.compile(optimizer=SGD(lr=sgd_lr, momentum=momentum, nesterov=True), loss=ned_loss, metrics=metrics_ned) print(nednet.summary()) nednet.fit(x_train, yt_train, callbacks=sgd_callbacks, validation_split=val_split, epochs=300, batch_size=batch_size, verbose=verbose) t_hat_test = nednet.predict(x_test)[:, 1] t_hat_train = nednet.predict(x_train)[:, 1] # cutting the activation layer cut_net = post_cut(nednet, x.shape[1], 0.01) cut_net.compile(optimizer=Adam(lr=1e-3), loss=dead_loss, metrics=metrics_cut) adam_callbacks = [ TerminateOnNaN(), EarlyStopping(monitor='val_loss', patience=2, min_delta=0.), ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=verbose, mode='auto', min_delta=1e-8, cooldown=0, min_lr=0) ] cut_net.fit(x_train, yt_train, callbacks=adam_callbacks, validation_split=val_split, epochs=100, batch_size=batch_size, verbose=verbose) sgd_callbacks = [ TerminateOnNaN(), EarlyStopping(monitor='val_loss', patience=40, min_delta=0.), ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=verbose, mode='auto', min_delta=0., cooldown=0, min_lr=0) ] sgd_lr = 1e-5 momentum = 0.9 cut_net.compile(optimizer=SGD(lr=sgd_lr, momentum=momentum, nesterov=True), loss=dead_loss, metrics=metrics_cut) cut_net.fit(x_train, yt_train, callbacks=sgd_callbacks, validation_split=val_split, epochs=300, batch_size=batch_size, verbose=verbose) y_hat_test = cut_net.predict(x_test) y_hat_train = cut_net.predict(x_train) yt_hat_test = np.concatenate([y_hat_test, t_hat_test.reshape(-1, 1)], 1) yt_hat_train = np.concatenate([y_hat_train, t_hat_train.reshape(-1, 1)], 1) test_outputs += [ _split_output(yt_hat_test, t_test, y_test, y_scaler, x_test, test_index) ] train_outputs += [ _split_output(yt_hat_train, t_train, y_train, y_scaler, x_train, train_index) ] K.clear_session() return test_outputs, train_outputs
# ### Regression
# ##### I think that this problem may be best generalized with a random forest model.
# First I will start with a regressor, then I will use a classifier

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

X = df[['temperature', 'humidity', 'IsHoliday', 'WeekDay', 'Season']]
y = df['P1']
today = [12, 47, 0, 1, 1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

sc_regr = StandardScaler()
X_train = sc_regr.fit_transform(X_train)
X_test = sc_regr.transform(X_test)
today = sc_regr.transform([today])

regr = RandomForestRegressor()
regr_svm = SVR()
regr.fit(X_train, y_train)
regr_svm.fit(X_train, y_train)

regr.predict(today)
regr_svm.predict(today)
print (f)
x = getDummy(x, f)

test = x.iloc[260753:, ]
train = x.iloc[:260753:, ]

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train).astype(np.int32)
y_train = np_utils.to_categorical(y_train)
print ("processing finished")

train = np.array(train)
train = train.astype(np.float32)
test = np.array(test)
test = test.astype(np.float32)

if need_normalise:
    scaler = StandardScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)

# folds
xfolds = pd.read_csv(projPath + 'input/xfolds.csv')
# work with 5-fold split
fold_index = xfolds.fold5
fold_index = np.array(fold_index) - 1
n_folds = len(np.unique(fold_index))

nb_classes = 2
print(nb_classes, 'classes')
dims = train.shape[1]
print(dims, 'dims')
)

vals = {}
for i, name in enumerate(names):
    vals[name] = X[:, i]
vals[dataset.default_target_attribute] = y
df = pd.DataFrame(vals)

X = df.drop(task_target, axis=1)
y = df.loc[:, task_target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#:# preprocessing
transform_pipeline = Pipeline([
    ('scaler', StandardScaler())
])
X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

#:# model
params = {'C': 0.8, 'solver': 'liblinear'}
classifier = LogisticRegression(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# 8b4aedd7d78f7193c71d75501c5c1bc6
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')
    xx.append([])
    for j in [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]:
        xx[i].append(data[i][j])

x = np.array(xx)
y = np.array(yy)
# print(len(x), len(y))

# 2. Split into training and test data
# Randomly sample 25% for testing, 75% for training
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

# 3. Standardize the training and test data
ss_x = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)

ss_y = StandardScaler()
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

# 4. Train and predict with a regression tree
# Initialize the decision tree regressor
dtr = DecisionTreeRegressor()
# Train
dtr.fit(x_train, y_train)
# Predict and save the predictions
dtr_y_predict = dtr.predict(x_test)
np.random.seed(seed=0)
random_state = 0

#######
# DATA:
#######
# a) for an application with 'real' data, we use the Wine dataset:
# In this case, after generating the data below, run the code only from line 54 through 227,
# to avoid the replications of the Monte Carlo simulation.
wine = datasets.load_wine()
X = wine.data
y = wine.target
data = DataFrame(X)

# standardize the variables so they are all on the same scale:
scaler = StandardScaler()
data = DataFrame(scaler.fit_transform(data), index=data.index, columns=data.columns)

# visualize the data only up to the fourth variable, so the plot is not too cluttered:
plot = sns.pairplot(data.iloc[:, 0:4])
cols = data.columns

# or:
# b) with simulated data:
# number of replications:
reps = 1000
acertou = []
for rep in tqdm(range(reps)):
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import *
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# data = pd.read_csv("../../data/data_all_float.csv", header=0, index_col=None, sep=';')

# drop 'CTE, linear' for more instances to study
drop_feature = ['Density', 'CTE, linear']
for ifeature in drop_feature:
    data = data.drop(labels=ifeature, axis=1)

stdscale = StandardScaler()
elements = [
    'Iron, Fe', 'Carbon, C', 'Sulfur, S', 'Silicon, Si', 'Phosphorous, P',
    'Manganese, Mn', 'Chromium, Cr', 'Nickel, Ni', 'Molybdenum, Mo', 'Copper, Cu'
]
target = 'Thermal Conductivity'

# drop instances with NaN
drop_instance = []
for idx in data.index:
    if math.isnan(data.loc[idx, target]):
        drop_instance.append(idx)
data_loc = data.drop(drop_instance)
X = data_loc[elements].values
def pls_variable_selection(x, y, num_pls_components): """ Adopted from https://nirpyresearch.com/variable-selection-method-pls-python/ :param x: :param y: :param max_components: :param scorer: :return: """ # initialize new model parameter holder scores = dict() scores['r2'] = ModelFit() scores['mae'] = ModelFit() cut_conditions = [] num_varaibles = [] # make a score table to fill in # scores = np.zeros( x.shape[1] ) # print('==========') pls = PLSRegression(num_pls_components) usable_columns = None best_score = 0 x_scaled_np = StandardScaler().fit_transform(x) x_scaled = pd.DataFrame(x_scaled_np, columns=x.columns) # print(x_scaled) while x_scaled.shape[1] >= num_pls_components: print('shape: ', x_scaled.shape, num_pls_components, best_score) number_to_cut = int(x_scaled.shape[1] / 100) if number_to_cut == 0: number_to_cut = 1 # print('number to cut: ', number_to_cut, num_pls_components) pls.fit(x_scaled, y) # y_predict = pls.predict(x_scaled) # score = r2_score(y, y_predict) cv_splitter = 3 # passing to corss_validate will implement a KFold with 3 folds group_splitter = None if x_scaled.shape[1] <= 200: # cv_splitter = ShuffleSplit(n_splits=100, test_size=0.35) cv_splitter = GroupShuffleSplit(n_splits=100, test_size=0.35) group_splitter = data_full['Leaf number'] elif x_scaled.shape[1] <= 400: # cv_splitter = ShuffleSplit(n_splits=30, test_size=0.35) cv_splitter = GroupShuffleSplit(n_splits=30, test_size=0.35) group_splitter = data_full['Leaf number'] local_scores = cross_validate( pls, x_scaled, y, cv=cv_splitter, return_train_score=True, groups=group_splitter, scoring=['r2', 'neg_mean_absolute_error']) scores['r2'].train_score.append(local_scores['train_r2'].mean()) scores['r2'].train_stdev.append(local_scores['train_r2'].std()) scores['r2'].test_score.append(local_scores['test_r2'].mean()) scores['r2'].test_stdev.append(local_scores['test_r2'].std()) scores['mae'].train_score.append( local_scores['train_neg_mean_absolute_error'].mean()) scores['mae'].train_stdev.append( local_scores['train_neg_mean_absolute_error'].std()) scores['mae'].test_score.append( local_scores['test_neg_mean_absolute_error'].mean()) scores['mae'].test_stdev.append( local_scores['test_neg_mean_absolute_error'].std()) num_varaibles.append(x_scaled.shape[1]) if scores['r2'].test_score[-1] > best_score: best_score = scores['r2'].test_score[-1] usable_columns = x_scaled.columns # print(pls.coef_[:, 0]) # print(pls.coef_.shape) sorted_coeff = np.argsort(np.abs(pls.coef_[:, 0])) # print('1') # print(sorted_coeff) # print( pls.coef_[:, 0][sorted_coeff] ) # print('2') # print(sorted_coeff[-5:]) # print(sorted_coeff[-1]) # print(x_scaled) # print(pls.coef_[:, 0][sorted_coeff[-1]], pls.coef_[:, 0][sorted_coeff[0]]) # print(sorted_coeff[-1], x_scaled.columns[sorted_coeff[0]]) # print(scores['r2'].train_score[-1], scores['r2'].test_score[-1], # scores['mae'].train_score[-1], scores['mae'].test_score[-1]) # column_to_drop = x_scaled.columns[sorted_coeff[0]] columns_to_drop = x_scaled.columns[sorted_coeff[:number_to_cut]] # print(columns_to_drop.values) if x_scaled.shape[1] < 50: # print('dropping: ', columns_to_drop) # print(columns_to_drop.values) cut_conditions.append(columns_to_drop.values) x_scaled.drop(columns=columns_to_drop, inplace=True) # print(usable_columns) # print('===========') # print(x_scaled.columns) # print(cut_conditions) # data = dict() # data['test means'] = test_scores_average # data['test std'] = test_scores_std # data['train means'] = train_scores_average # data['train std'] = train_scores_std # 
data['columns'] = usable_columns # data['num variables'] = num_varaibles scores['columns'] = usable_columns scores['num variables'] = num_varaibles # print('========') # print(data.keys()) # filename = "param_selector_{0}.pickle".format(num_pls_components) # with open(filename, 'wb') as f: # pickle.dump(data, f, pickle.HIGHEST_PROTOCOL) return scores
# Get dataset and features
#==============================#
aalist = list('ACDEFGHIKLMNPQRSTVWY')

def getAAC(seq):
    aac = np.array([seq.count(x) for x in aalist]) / len(seq)
    return aac

data = pd.read_excel('sequence_ogt_topt.xlsx', index_col=0)
aac = np.array([getAAC(seq) for seq in data['sequence']])
ogt = data['ogt'].values.reshape((data.shape[0], 1))
X = np.append(aac, ogt, axis=1)
sc = StandardScaler()
X = sc.fit_transform(X)
y = data['topt'].values

# Strategies and hyperparameters
#======================================#
# Hyperparameter range
cl_vals = [25.0, 30.0, None]
ch_vals = [72.2, 60.0]
ks = [5, 10, 15]
deltas = [0.1, 0.5, 1.0]
overs = [0.5, 0.75]
unders = [0.5, 0.75]
sizes = [300, 600]
sample_methods = ['balance', 'extreme', 'average']
def featureScal(X):
    sc_X = StandardScaler()
    X_scaled = sc_X.fit_transform(X)
    return X_scaled
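# Hedged sketch (not in the original): featureScal refits the scaler on every
# call, so applying it separately to a held-out set would use different
# statistics. A common alternative is to fit once on the training split and
# reuse the fitted scaler; the demo data below is made up for illustration.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X_demo = rng.rand(100, 4)
X_tr, X_te = train_test_split(X_demo, random_state=0)
sc = StandardScaler()
X_tr_scaled = sc.fit_transform(X_tr)   # fit only on the training split
X_te_scaled = sc.transform(X_te)       # reuse the same statistics on the test split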
xgb_preds = np.expm1(model_xgb.predict(X_test))
lasso_preds = np.expm1(model_lasso.predict(X_test))
predictions = pd.DataFrame({"xgb": xgb_preds, "lasso": lasso_preds})
predictions.plot(x="xgb", y="lasso", kind="scatter")

# weighted blend of the two models
preds = 0.7 * lasso_preds + 0.3 * xgb_preds
solution = pd.DataFrame({"id": test.Id, "SalePrice": preds})
# solution.to_csv("lasso_xgb.csv", index=False)

from keras.layers import Dense
from keras.models import Sequential
from keras.regularizers import l1
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X_train = StandardScaler().fit_transform(X_train)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, random_state=3)
print(X_tr.shape)

model = Sequential()
# model.add(Dense(256, activation="relu", input_dim=X_train.shape[1]))
# kernel_regularizer is the current Keras name for the old W_regularizer keyword
model.add(Dense(1, input_dim=X_train.shape[1], kernel_regularizer=l1(0.001)))
model.compile(loss="mse", optimizer="adam")
model.summary()

hist = model.fit(X_tr, y_tr, validation_data=(X_val, y_val))
pd.Series(model.predict(X_val)[:, 0]).hist()
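# Hedged sketch (not in the original): a quick check of the network's
# validation error on the log-scale target, assuming `y` was log1p-transformed
# as the np.expm1 calls above imply.
import numpy as np
from sklearn.metrics import mean_squared_error

val_preds = model.predict(X_val)[:, 0]
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print("validation RMSE (log scale): {:.4f}".format(rmse))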
from flask import Flask, render_template, request
import pickle
from sklearn.preprocessing import StandardScaler

app = Flask(__name__)
classifier = pickle.load(open('score.pkl', 'rb'))


@app.route('/', methods=['GET'])
def home():
    return render_template('index.html')


standard_to = StandardScaler()


@app.route("/predict", methods=['POST'])
def predict():
    if request.method == 'POST':
        grescore = int(request.form['grescore'])
        toeflscore = int(request.form['toeflscore'])
        rating = request.form['rating']
        SOP = float(request.form['sop'])
        LOR = float(request.form['lor'])
        CGPA = float(request.form['cgpa'])
        Research = request.form['Research']
        if Research == 'yes':
            Research = 1
        else:
            Research = 0
# import data
train = get_feature_data()
df_orig = get_original_data()

# Whether to vectorize original data or not
vectorise_text = True
if vectorise_text:
    vectorized_description = vectorize_text_feature(train, 'Description', 15)
    vectorized_title = vectorize_text_feature(train, 'Country', 5)
    train2 = train.drop(['InvoiceDate'], axis=1)
    train2 = pd.concat([train2, vectorized_description, vectorized_title], axis=1)
else:
    train2 = train.drop(['InvoiceDate'], axis=1)

# standardize data
scaler = StandardScaler()
scaler = scaler.fit(train2)
train_scaled = scaler.transform(train2)

# use principal component analysis to reduce dimensions
pca = PCA(n_components=20)
pca = pca.fit(train_scaled)
train_reduced = pca.transform(train_scaled)

# calculate matrix of cosine similarity
distances = cosine_similarity(train_reduced).T

# find top 5 closest ads
top_5_orig = []
top_distances = []
for i in range(len(train_reduced)):
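# Hedged sketch (not in the original, whose loop body is cut off): one way the
# top-5 lookup could proceed, sorting each row of the cosine-similarity matrix
# and skipping the item's match with itself. The df_orig indexing is assumed.
import numpy as np

for i in range(len(train_reduced)):
    order = np.argsort(distances[i])[::-1]      # most similar first
    top_idx = [j for j in order if j != i][:5]  # drop the self-match, keep top 5
    top_5_orig.append(df_orig.iloc[top_idx])
    top_distances.append(distances[i][top_idx])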
def bayesian_regression_modeling(df, label_col, target_col, prior_distribution_list,
                                 draw_sample=1000, chains=2, scaling_opt=3):
    """
    :param df: input DataFrame
    :param label_col: predictor column names
    :param target_col: target column name
    :param prior_distribution_list: list of {feature or "Target": prior name} dicts
    :param draw_sample: number of MCMC draws
    :param chains: number of MCMC chains
    :param scaling_opt: 1 = standardize, 2 = min-max scale, 3 = no scaling
    :return: MCMC trace, traceplot img source, posterior plot img source
    """
    n_individuals = len(df)

    # optional feature scaling
    if scaling_opt == 1:
        df[df.columns] = StandardScaler().fit_transform(df[df.columns])
    elif scaling_opt == 2:
        df[df.columns] = MinMaxScaler().fit_transform(df[df.columns])
    elif scaling_opt == 3:
        df[df.columns] = df[df.columns]

    # degree of freedom for the Student-T priors: rows minus columns, floored at 0
    if len(df[label_col].count(axis=1)) >= len(df[label_col].count(axis=0)):
        nu = len(df[label_col].count(axis=1)) - len(df[label_col].count(axis=0))
    else:
        nu = 0
    print("Degree of freedom:", nu)

    with pm.Model() as model:
        fea_list = [variable for variable in label_col]

        # one coefficient per feature, using the prior chosen for it
        pm_list = []
        i = 0
        for prior_dist in prior_distribution_list:
            for var_type, prior in prior_dist.items():
                if var_type != "Target" and prior == "Normal":
                    pm_list.append(pm.Normal(str(fea_list[i])))
                    i += 1
                elif var_type != "Target" and prior == "Student T":
                    pm_list.append(pm.StudentT(str(fea_list[i]), nu=nu))
                    i += 1
                elif var_type != "Target" and prior == "Skew Normal":
                    pm_list.append(pm.SkewNormal(str(fea_list[i])))
                    i += 1

        sigma = pm.HalfCauchy('sigma', beta=10, testval=1.)
        intercept = pm.Normal('Intercept', 0, sigma=20)

        # linear predictor: intercept plus coefficient * feature for each column
        mu = intercept
        for i in range(len(pm_list)):
            mu += pm_list[i] * df[fea_list[i]].to_numpy()

        # likelihood, conditioned on the observed target, with the prior family
        # chosen for the target
        for prior_dist in prior_distribution_list:
            for var_type, prior in prior_dist.items():
                if var_type == "Target" and prior == "Normal":
                    likelihood = pm.Normal(str(target_col), mu=mu, sigma=sigma,
                                           observed=df[target_col])
                elif var_type == "Target" and prior == "Student T":
                    likelihood = pm.StudentT(str(target_col), nu=nu, mu=mu, sd=sigma,
                                             observed=df[target_col])
                elif var_type == "Target" and prior == "Skew Normal":
                    likelihood = pm.SkewNormal(str(target_col), mu=mu, sigma=sigma,
                                               alpha=1, observed=df[target_col])
                elif var_type == "Target":
                    # missing or unrecognized prior: default to a Normal likelihood
                    likelihood = pm.Normal(str(target_col), mu=mu, sigma=sigma,
                                           observed=df[target_col])

        trace = pm.sample(draws=draw_sample, chains=chains, random_seed=23,
                          progressbar=True)

    img_source = save_mat_fig(trace, gtype='traceplot')
    posterior_dist = save_mat_fig(trace, gtype='posterior')
    return trace, img_source, posterior_dist
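# Hedged usage sketch (not in the original): inspecting the fitted coefficients
# from the returned trace with PyMC3's summary, assuming `df`, `label_col`,
# `target_col`, and `prior_distribution_list` are already prepared elsewhere.
trace, traceplot_img, posterior_img = bayesian_regression_modeling(
    df, label_col, target_col, prior_distribution_list,
    draw_sample=1000, chains=2, scaling_opt=1)
print(pm.summary(trace))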
def preprocess_data(X, scaler=None):
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler
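# Hedged usage sketch (not in the original): fit the scaler on the training
# split once, then pass it back in for the test split; the demo arrays below
# are made up for illustration.
import numpy as np

rng = np.random.RandomState(0)
X_train_demo, X_test_demo = rng.rand(80, 3), rng.rand(20, 3)
X_train_demo, scaler = preprocess_data(X_train_demo)                 # fits a new scaler
X_test_demo, _ = preprocess_data(X_test_demo, scaler=scaler)         # reuses the fitted scaler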
# Format the features and labels for use with scikit learn
feature_list = []
label_list = []

for item in training_set:
    if np.isnan(item[0]).sum() < 1:
        feature_list.append(item[0])
        label_list.append(item[1])

print('Features in Training Set: {}'.format(len(training_set)))
print('Invalid Features in Training set: {}'.format(
    len(training_set) - len(feature_list)))

X = np.array(feature_list)
# Fit a per-column scaler
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
X_train = X_scaler.transform(X)
y_train = np.array(label_list)

# Convert label strings to numerical encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

# Create classifier
clf = svm.SVC(kernel='linear')

# Set up 5-fold cross-validation
kf = cross_validation.KFold(len(X_train), n_folds=5, shuffle=True,
plt.style.use('seaborn-deep')
import matplotlib.cm
cmap = matplotlib.cm.get_cmap('plasma')

# Reading in data
ds = pd.read_csv("Social_Network_Ads.csv")
X = ds.iloc[:, 2:4].values
y = ds.iloc[:, 4].values

# Splitting and scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)  # transform only: reuse the statistics fitted on the training set

# Classifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
clf = SVC(kernel='rbf', C=20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = (cm[0][0] + cm[1][1]) / sum(sum(cm))

# Cross validation
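# Hedged sketch (not in the original, which stops at the "Cross validation"
# header): one way to cross-validate the same SVC on the scaled training data.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(SVC(kernel='rbf', C=20), X_train, y_train, cv=10)
print("mean CV accuracy: {:.3f} (+/- {:.3f})".format(cv_scores.mean(), cv_scores.std()))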
def __init__(self, columns=None, **kwargs):
    self.columns = columns
    self.model = StandardScaler(**kwargs)
    self.transform_cols = None
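# Hedged sketch (hypothetical; the rest of the class is not shown in the
# original): a minimal column-wise scaler built around the same idea, fitting a
# StandardScaler on selected DataFrame columns and leaving the others untouched.
import pandas as pd
from sklearn.preprocessing import StandardScaler

class ColumnScaler:
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = StandardScaler(**kwargs)
        self.transform_cols = None

    def fit(self, X, y=None):
        self.transform_cols = self.columns if self.columns is not None else list(X.columns)
        self.model.fit(X[self.transform_cols])
        return self

    def transform(self, X):
        X = X.copy()
        X[self.transform_cols] = self.model.transform(X[self.transform_cols])
        return X

# usage
df_demo = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
print(ColumnScaler(columns=['a']).fit(df_demo).transform(df_demo))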
import pandas as pd
import plotly.figure_factory as ff
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas

# %%
df = pd.read_csv('new_iris_data.csv')

# %%
# Standardize first
iris_scale = StandardScaler().fit_transform(df)

# apply PCA to reduce to 2 principal components
pca_model = PCA(n_components=2, random_state=0)
iris_pca = pca_model.fit_transform(iris_scale)
print(pca_model.explained_variance_ratio_)

# %%
iris_pca_df = pd.DataFrame(data=iris_pca, columns=['PC_1', 'PC_2'])
iris_pca_df.head(-5)

# %%
# creating a dendrogram using plotly.figure_factory
fig = ff.create_dendrogram(iris_pca_df, color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()
# the higher the horizontal lines, the less similarity there is between the clusters.
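# %%
# Hedged sketch (not in the original, which stops at the dendrogram): the
# AgglomerativeClustering and hvplot imports above suggest a clustering step
# like this might follow, with the cluster count read off the dendrogram.
agg_model = AgglomerativeClustering(n_clusters=3)
iris_pca_df['class'] = agg_model.fit_predict(iris_pca_df[['PC_1', 'PC_2']])
iris_pca_df.hvplot.scatter(x='PC_1', y='PC_2', by='class')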