# Standardize: scale every feature column with StandardScaler (zero mean,
# unit variance by default).  The column labels are captured first because
# the transformed X is no longer the original labelled frame.
from sklearn.preprocessing import StandardScaler
cols = X.columns
X = StandardScaler().fit_transform(X)

# Normalizer won't work with NAs, so this is a good time to fill them in.

# Use k-means based filling when `iterationnum` is undefined (e.g. the script
# is run stand-alone) or when this is iteration 1 or later.
if ('iterationnum' not in globals()) or (iterationnum >= 1):

    # NOTE(review): kmeans_missing is defined elsewhere; presumably the third
    # return value is X with the missing entries filled in -- confirm.
    labels, centroids, X = kmeans_missing(X, n_clusters=20, max_iter=10)
    del labels, centroids

# Otherwise (iteration 0): drop every row that contains a missing value.
else:

    X = pd.DataFrame(X)
    # Use a positional NumPy mask rather than a boolean Series.  X was just
    # rebuilt with a fresh 0..n-1 index, so a Series mask would be aligned by
    # label against y / winmults and would fail (or pick the wrong rows)
    # whenever those objects carry a different index.
    nas = X.isnull().any(axis=1).values
    X = X[~nas]
    y = y[~nas]
    winmults = winmults[~nas]
    del nas

# Normalize: rescale each sample (row) to unit norm (Normalizer default: L2).
from sklearn.preprocessing import Normalizer
X = Normalizer().fit_transform(X)

# Persist the processed features together with targets, column labels and
# winmults.  `save` is a project helper defined outside this section.
save('../out/d2-fight-level-standardize-normalize-kmeansNA.pkl', X, y, cols,
     winmults)

# Free the large objects; winmults is intentionally left in scope as before.
del X, y, cols
Example #2
0
          'pl_bmassj','pl_radj','st_mass', 'st_teff', 'st_rad', 'st_metfe']
def _standardized(frame, columns):
    """Return the given columns of `frame`, scaled by StandardScaler.

    The result is a new DataFrame labelled with `columns`; it is built from
    the raw value array, so it carries a fresh default index rather than the
    index of `frame`.
    """
    scaled = StandardScaler().fit_transform(frame.loc[:, columns].values)
    return pd.DataFrame(scaled, columns=columns)


# Keep only the columns listed in `prop` and discard every row that has a
# missing value in any of them.
df = df.loc[:, prop].dropna(subset=prop, how='any', axis=0)

# Column groups: planet-level properties and host-star properties.
pl_prop = ['pl_orbper', 'pl_orbsmax', 'pl_orbeccen',
           'pl_bmassj', 'pl_radj']
st_prop = ['st_mass', 'st_teff', 'st_rad', 'st_metfe']

# Standardized copies of each group.
X = _standardized(df, pl_prop)
y = _standardized(df, st_prop)

# Per-column count of missing values in y (only displayed in an interactive
# session; the result is not stored).
y.isnull().sum()


# Scatter plot with a fitted regression line: `st_mass` on the x-axis against
# `pl_bmassj` on the y-axis.  x and y are passed by keyword because seaborn
# >= 0.12 made them keyword-only (positional use raises TypeError there);
# the keyword form works on older releases as well.
sns.regplot(x=df['st_mass'], y=df['pl_bmassj'])



# Global plot styling for subsequent figures, plus an 8-colour HLS palette
# (lightness 0.3, saturation 0.8) exported as hex colour strings.
sns.set(font='serif', font_scale=1.4, style='ticks')
palette = sns.hls_palette(8, l=.3, s=.8)
pal = palette.as_hex()

# 80/20 train/test split.
# NOTE(review): the arrays are passed in the order (y, X), so X_train/X_test
# receive the rows of `y` and y_train/y_test receive the rows of `X`.  This is
# left unchanged because later code may rely on it -- confirm it is intended.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(y, X, train_size=0.8)