# Vectorizing the text features three ways: binary counts, hashed features,
# and TF-IDF weights. Each vectorizer is fit on the training split only,
# then applied to the test split.
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

vectC = CountVectorizer(binary=True)
vectC_train = vectC.fit_transform(X_train)
vectC_test = vectC.transform(X_test)

vectH = HashingVectorizer()
vectH_train = vectH.fit_transform(X_train)
vectH_test = vectH.transform(X_test)

vectTfid = TfidfVectorizer()
vectTfid_train = vectTfid.fit_transform(X_train)
vectTfid_test = vectTfid.transform(X_test)

# Using a robust scaler on the numeric features
scale = RS()
fits = scale.fit(X)
rs = pd.DataFrame(fits.transform(X))
rs['income'] = y
robust = rs.dropna(subset=['income'])

# Making testing and training sets
robust_train, robust_test = train_test_split(robust)
robust_train_X = robust_train.drop('income', axis=1)
robust_train_y = robust_train['income']

# =============================================================================
# CLASSIFICATIONS
# =============================================================================
# from sklearn.linear_model import LinearRegression as LIN
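# A minimal sketch of how one of the vectorized matrices could feed a
# classifier under the CLASSIFICATIONS header. MultinomialNB and the
# y_train/y_test labels are assumptions here, not part of the source snippet.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb = MultinomialNB()
nb.fit(vectTfid_train, y_train)       # train on the TF-IDF features
preds = nb.predict(vectTfid_test)     # score on the held-out split
print('TF-IDF + MultinomialNB accuracy: %s' % accuracy_score(y_test, preds))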
# Train test split
from sklearn.model_selection import train_test_split as tts

X_train, X_test, Y_train, Y_test = tts(X, Y, test_size=0.10, shuffle=True)
print("X training length: " + repr(len(X_train)))
print("X testing length:  " + repr(len(X_test)))
print("Y training length: " + repr(len(Y_train)))
print("Y testing length:  " + repr(len(Y_test)))

# In[9]:

# Feature scaling: fit the scaler on the training split only, then apply the
# same transform to both splits to avoid test-set leakage.
from sklearn.preprocessing import RobustScaler as RS

scaler = RS()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# In[44]:

# Training a k-nearest-neighbors classifier
from sklearn.neighbors import KNeighborsClassifier as KNC

classifier = KNC(n_neighbors=21)
classifier.fit(X_train, Y_train)

# In[45]:
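# A minimal evaluation sketch for the classifier trained above. The metric
# calls are standard sklearn, but treating this as the notebook's next cell
# is an assumption.
from sklearn.metrics import accuracy_score, confusion_matrix

Y_pred = classifier.predict(X_test)
print("Accuracy: " + repr(accuracy_score(Y_test, Y_pred)))
print(confusion_matrix(Y_test, Y_pred))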
# =============================================================================
# Loading the iris dataset
datas = datasets.load_iris()
X = datas.data
y = datas.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)

from sklearn.preprocessing import RobustScaler as RS
from sklearn.preprocessing import PolynomialFeatures as POLY
from sklearn.preprocessing import OneHotEncoder as ONEHOT

# ROBUST SCALER (the label column is named 'target' here; the original
# called it 'income', a leftover from a different dataset)
scaleRS = RS()
fitsRS = scaleRS.fit(X)
rs = pd.DataFrame(fitsRS.transform(X))
rs['target'] = y
robust = rs.dropna(subset=['target'])
robust_train, robust_test = train_test_split(robust)
robust_train_X = robust_train.drop('target', axis=1)
robust_train_y = robust_train['target']

# POLYNOMIAL TRANSFORM
tranPOLY = POLY()
fitsPOLY = tranPOLY.fit(X)
poly = pd.DataFrame(fitsPOLY.transform(X))
poly['target'] = y
polys = poly.dropna(subset=['target'])
polys_train, polys_test = train_test_split(polys)
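# ONE-HOT ENCODING: the ONEHOT alias is imported above but never used. This
# is a minimal sketch rounding out the pattern; encoding the iris target
# labels is an assumed example, not from the source.
tranHOT = ONEHOT()
fitsHOT = tranHOT.fit(y.reshape(-1, 1))
hot = pd.DataFrame(fitsHOT.transform(y.reshape(-1, 1)).toarray())
hot_train, hot_test = train_test_split(hot)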
wine_df.dtypes

# Use a command to retrieve the mean, median, 1st and 3rd quartiles of each
# column. What is the mean of 'malic_acid'? What is the max of 'magnesium'?
wine_df.describe()

# Plot box plots of all features for each class. Which feature has the
# highest median? The most variance?
wine_df.boxplot(rot=90)

# Use RobustScaler to put all features on a similar scale, then replot the
# box plots.
from sklearn.preprocessing import RobustScaler as RS
from sklearn.model_selection import train_test_split

X = wine_data.data
y = wine_data.target
scaleRS = RS()
fitsRS = scaleRS.fit(X)
wine_rs = pd.DataFrame(fitsRS.transform(X))
wine_rs['target'] = y
wine_rs.boxplot()

# Do a scatter plot of flavanoids vs hue, with each target value a different
# color. How separated is the data?
wine_df.plot.scatter(x='hue', y='flavanoids', c='target')

# Do an NMF decomposition (2 components) of the data. What features
# contribute the most to each component?
from sklearn.decomposition import NMF

nmf = NMF(n_components=2, init='random', random_state=0)
W = nmf.fit_transform(X)
H = nmf.components_
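# A minimal sketch (assumed, not from the source) for answering the question
# above: each row of H holds one component's per-feature weights, so sorting
# a row shows which features contribute most to that component.
import numpy as np

feature_names = np.array(wine_data.feature_names)
for i, component in enumerate(H):
    top = np.argsort(component)[::-1][:3]  # indices of the three largest weights
    print('Component %d: %s' % (i, feature_names[top]))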
# Loading the iris dataset, splitting into testing and training sets
datas = datasets.load_iris()
X = datas.data
y = datas.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)

# log = LOG()
# poly = POLY(3)
# scale = RS()

# Train a logistic regression model with a polynomial transform and a robust
# scaler
from sklearn.metrics import mean_squared_error as msq
from sklearn.metrics import r2_score as r2

pipeline = Pipeline(steps=[('rs', RS()),
                           ('poly', POLY(degree=2)),
                           ('logistic', LOG())])
pipeline.fit(X_train, y_train)
# Metrics take the true labels first, then the predictions
log_msq = msq(y_test, pipeline.predict(X_test))
log_r2 = r2(y_test, pipeline.predict(X_test))
print('\nThe mean squared error of the Logistic Regression model is: \t\t%s' % log_msq)
print('The R2 score of the Logistic Regression model is: \t\t\t%s' % log_r2)

# pipe = make_pipeline(TfidfVectorizer(), LogisticRegression())
parameters = {
    'poly__degree': [1, 2, 5, 10],
    'logistic__C': [1, 2, 5, 10],
}
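# A minimal sketch of how the parameter grid above could be used. The source
# snippet is truncated here, so wiring the grid into GridSearchCV is an
# assumption about where it was heading, not something the source shows.
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(pipeline, parameters, cv=5)
search.fit(X_train, y_train)
print('Best parameters: %s' % search.best_params_)
print('Best CV score:   %s' % search.best_score_)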
from sklearn.preprocessing import RobustScaler as RS


def Robust_norm(arr):
    """Fit a RobustScaler to arr and return the scaled array."""
    scalerRS = RS()
    scalerRS.fit(arr)
    arrRS = scalerRS.transform(arr)
    return arrRS
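# A quick usage sketch (assumed data, not from the source): RobustScaler
# centers on the median and scales by the IQR, so the outlier barely moves
# the fit for the other rows.
import numpy as np

sample = np.array([[1.0], [2.0], [3.0], [100.0]])  # 100.0 is an outlier
print(Robust_norm(sample))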
def __init__(self, **kwargs):
    r"""Initialize RobustScaler."""
    self._params = dict(
        with_centering=ParameterDefinition([True, False]),
        with_scaling=ParameterDefinition([True, False]),
    )
    self.__robust_scaler = RS()
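# For context, a minimal sketch (plain scikit-learn, outside the wrapper
# above, and an assumed example) of the two boolean knobs the parameter
# space enumerates. Both are genuine RobustScaler keyword arguments:
# with_centering=False skips the median subtraction, and with_scaling=False
# skips the IQR division.
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler(with_centering=True, with_scaling=False)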