# Bag-of-words features: binary term presence/absence (counts capped at 1).
vectC = CountVectorizer(binary=True)
vectC_train = vectC.fit_transform(X_train)
# Reuse the vocabulary fitted on the training split — never refit on test data.
vectC_test = vectC.transform(X_test)

# Hashed term features: the hashing trick needs no stored vocabulary.
vectH = HashingVectorizer()
vectH_train = vectH.fit_transform(X_train)
vectH_test = vectH.transform(X_test)

# TF-IDF weighted term features.
vectTfid = TfidfVectorizer()
vectTfid_train = vectTfid.fit_transform(X_train)
vectTfid_test = vectTfid.transform(X_test)

# using a robust scaler (median/IQR based, resistant to outliers);
# X, y, RS and pd are assumed to be defined earlier in the file
scale = RS()
fits = scale.fit(X)
rs = pd.DataFrame(fits.transform(X))
rs['income'] = y
# Drop rows whose target is missing so every remaining row is labelled.
robust = rs.dropna(subset=['income'])

# making testing and training sets (train_test_split's default 75/25 split)
robust_train, robust_test = train_test_split(robust)
robust_train_X = robust_train.drop('income', axis=1)
robust_train_y = robust_train['income']

# =============================================================================
# CLASSIFICATIONS
# =============================================================================

#from sklearn.linear_model import LinearRegression as LIN
#Train test split

from sklearn.model_selection import train_test_split as tts

# Hold out 10% of the samples for testing; rows are shuffled before splitting.
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size=0.10, shuffle=True)

# Report the split sizes (int repr is identical to str, so output is unchanged).
print(f"Training length:{len(X_train)!r}")
print(f"Testing length:{len(X_test)!r}")
print(f"Training length:{len(Y_train)!r}")
print(f"Testing length:{len(Y_test)!r}")

# In[9]:

#Feature scaling

from sklearn.preprocessing import RobustScaler as RS

# Fit the scaler on the training split only, then apply the same transform
# to both splits so no test-set statistics leak into training.
scaler = RS().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# In[44]:

#Training

from sklearn.neighbors import KNeighborsClassifier as KNC

# k-nearest-neighbours classifier with k = 21; .fit returns the fitted
# estimator itself, so chaining leaves `classifier` bound to the same object.
classifier = KNC(n_neighbors=21).fit(X_train, Y_train)

# In[45]:
# =============================================================================
# loading the iris dataset, then demonstrating two preprocessing transforms
# (RobustScaler and PolynomialFeatures) on it
datas = datasets.load_iris()
X = datas.data
y = datas.target
# 50/50 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=2)

from sklearn.preprocessing import RobustScaler as RS
from sklearn.preprocessing import PolynomialFeatures as POLY
from sklearn.preprocessing import OneHotEncoder as ONEHOT

# ROBUST SCALER: median/IQR scaling, resistant to outliers.
scaleRS = RS()
fitsRS = scaleRS.fit(X)
rs = pd.DataFrame(fitsRS.transform(X))
# NOTE(review): the target column is named 'income' although y holds iris
# species labels — presumably copied from another example; verify intent.
rs['income'] = y
robust = rs.dropna(subset=['income'])
robust_train, robust_test = train_test_split(robust)
robust_train_X = robust_train.drop('income', axis=1)
robust_train_y = robust_train['income']

# POLYNOMIAL TRANSFORM: POLY() uses its default degree-2 expansion.
tranPOLY = POLY()
fitsPOLY = tranPOLY.fit(X)
poly = pd.DataFrame(fitsPOLY.transform(X))
poly['income'] = y
polys = poly.dropna(subset=['income'])
polys_train, polys_test = train_test_split(polys)
# Example #4
# Inspect the dtype of each column.
wine_df.dtypes

# Use a command to retrieve the mean, median, 1st and 3rd quartiles of each
# column. What is the mean of 'malic_acid', what is the max of magnesium?
wine_df.describe()

# Plot box plots of all features for each class; which value has the highest
# median? The most variance? (x labels rotated 90 degrees for readability)
wine_df.boxplot(rot=90)

# Use RobustScaler to put all features on a similar scale. Replot the box plots.
from sklearn.preprocessing import RobustScaler as RS
from sklearn.model_selection import train_test_split

X = wine_data.data
y = wine_data.target

# Fit on the full feature matrix and wrap the scaled values in a DataFrame.
scaleRS = RS()
fitsRS = scaleRS.fit(X)
wine_rs = pd.DataFrame(fitsRS.transform(X))
wine_rs['target'] = y

wine_rs.boxplot()

# Do a scatter plot of flavanoids vs hue with each target value being a
# different color. How separated is the data?
wine_df.plot.scatter(x='hue',y='flavanoids',c='target')

# Do a NMF decomposition (2 components) of the data. What features contribute
# the most to each component?
from sklearn.decomposition import NMF

# Non-negative matrix factorisation: X is approximated by W @ H with 2 latent
# components; a fixed random_state makes the 'random' init reproducible.
nmf = NMF(n_components=2, init='random', random_state=0)
W = nmf.fit_transform(X)
H = nmf.components_
# Example #5
# loading the iris dataset, splitting into testing and training sets
datas = datasets.load_iris()
X = datas.data
y = datas.target
# 50/50 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=2)

#log = LOG()
#poly = POLY(3)
#scale = RS()

# Train a logistic regression model with a polynomial transform and a robust
# scaler, chained in a single Pipeline so the identical preprocessing is
# applied at both fit and predict time.
pipeline = Pipeline(steps=[('rs', RS()), ('poly',
                                          POLY(degree=2)), ('logistic',
                                                            LOG())])

pipeline.fit(X_train, y_train)

# Score on the held-out split.  sklearn metrics take (y_true, y_pred) in that
# order; r2_score is NOT symmetric, so the original swapped-argument call
# reported a wrong R².  Predict once and reuse the result for both metrics.
y_pred = pipeline.predict(X_test)
log_msq = msq(y_test, y_pred)
log_r2 = r2(y_test, y_pred)
print('\nThe mean squared error of the Logistic Regression model is: \t\t%s' %
      log_msq)
print('The R2 score of the Logistic Regression model is: \t\t\t%s' % log_r2)

#pipe = make_pipeline(TfidfVectorizer(), LogisticRegression())
parameters = {
    'poly__degree': [1, 2, 5, 10],
    'logistic__C': [1, 2, 5, 10],
# Example #6
def Robust_norm(arr):
    """Return *arr* rescaled by a freshly fitted RobustScaler.

    ``fit_transform`` performs the same fit-then-transform sequence the
    original two-step version did, in a single call.
    """
    return RS().fit_transform(arr)
# Example #7
 def __init__(self, **kwargs):
     r"""Initialize RobustScaler wrapper.

     Builds the search space for the wrapped scaler's boolean
     hyperparameters and instantiates the underlying sklearn scaler.
     ``kwargs`` are accepted but not used here — presumably for
     interface compatibility with sibling classes; confirm upstream.
     """
     # Tunable hyperparameters exposed as ParameterDefinition choices.
     self._params = dict(with_centering=ParameterDefinition([True, False]),
                         with_scaling=ParameterDefinition([True, False]))
     # The wrapped sklearn RobustScaler instance (name-mangled private).
     self.__robust_scaler = RS()