Exemple #1
0
def lda_training(X, y):
    """
    Linear Discriminant Analysis model training. Estimates the test error and computes the training error.
    :param X:
    :param y:
    :return:
    """
    estimator = LinearDiscriminantAnalysis()
    estimated_test_error = estimate_test_error(estimator, X, y)
    print("Estimated test error for model {} :\n\t{}".format(
        estimator.get_params(), estimated_test_error))
    current_model = LinearDiscriminantAnalysis()
    current_model.fit(X, y)
    y_pred = current_model.predict(X)
    error = roc_auc_score(y, y_pred)
    print("Training error for model {} :\n\t{}".format(
        current_model.get_params(), error))
    return current_model
Exemple #2
0
def iris_data():
    # define dataset
    #X, y = make_classification(n_samples=1000, n_features=10, n_informative=10, n_redundant=0, random_state=1)
    X, y = load_iris(True)
    print(X, y)
    # define model
    model = LinearDiscriminantAnalysis()
    print(model.get_params(True))
    #simple_prediction(X, y, model)
    solvers = ["svd", "lsqr", "eigen"]
    row = [0.1, 3.5, 4.2, 100]
    for solver in solvers:
        result = hyper_parameters(X, y, model, solver)
        pr_class = result.predict([[10, 25, 30, 40]])
        print(pr_class)
Exemple #3
0
def LDA_as_reduction():
    X, y = make_classification(n_samples=10,
                               n_features=6,
                               n_informative=6,
                               n_redundant=0,
                               random_state=2,
                               n_classes=3)
    model = LinearDiscriminantAnalysis(n_components=2)
    print(X, y)
    print(model.get_params(True))
    model.fit(X, y)
    print(model.predict([[2, 4, 5, -1, 0, 4]]))
    X_trans = model.transform(X)
    print(X_trans)
    model.fit(X_trans, y)
    print(model.predict([[2, 4]]))
    print(model.predict([[-3, 5]]))
line = ax.plot(xx, yy, color='black', linewidth=2)
plt.scatter(xp1, yp1, color='red', s=3)
plt.scatter(xp2, yp2, color='green', s=3)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Fisher Discriminant Analysis')
plt.show()

print('Accuracy Score of Fisher Discriminant Analysis: %f' %
      (accuracy_score(y_test, pred)))
# LDA方法
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train, y_train)
pred = []
pred = LDA.predict(X_test)
para = LDA.get_params()

w1 = np.dot(np.linalg.inv(cov_m), (avg1 - avg2).T)
w0 = -0.5 * np.dot((avg1 + avg2).T, w1)

xx = np.arange(-10, 40, 0.01)
yy = -xx * w1[0] / w1[1] - w0

# 预测结果为第一类的点
xp1 = X_test.iloc[list(np.where(np.array(pred) == 1)[0]), 0]
yp1 = X_test.iloc[list(np.where(np.array(pred) == 1)[0]), 1]

# 预测结果为第二类的点
xp2 = X_test.iloc[list(np.where(np.array(pred) == 2)[0]), 0]
yp2 = X_test.iloc[list(np.where(np.array(pred) == 2)[0]), 1]
Exemple #5
0
y = diagnostic[:trainingSetLength,
               1:]  # target values (i.e. expected output for X)

for i in range(len(y)):
    y[i] = int(y[i])
y = np.transpose(y).astype('int')

trainingSet = extractedFeatures[:trainingSetLength]

lda = LinearDiscriminantAnalysis()

lda.fit(trainingSet, y[0])
# letting the algorithm know which sample in X belongs to which class labelled in y

# save the params to disk
lda_params = lda.get_params()
params_lda = 'params_lda.sav'

# save the model to disk
filename_lda = 'lda_model.sav'

#testSet=extractedFeatures[trainingSetLength:trainingSetLength+10]
#prediction=lda.predict(testSet)

pickle.dump(lda, open(filename_lda, 'wb'))
pickle.dump(lda_params, open(params_lda, 'wb'))

#%%TEST CLASSIFICATION - QDA
excelAddress = 'C:\\Users\\theor\\Downloads\\Ground_truth_ISIC_1.xlsx'
trainingSetLength = 500
Exemple #6
0
x_nd = d_no_dummies.drop(['salary_bin'], axis=1).values.astype(float)
x_numeric = d_no_dummies[['age', 'fnlwgt', 'education_nbr', 'capital_gain', 'capital_loss', 'hours_per_week']].values.astype(float)

#Split data into training and test sets - be sure to stratify since this is for classification
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=seed)
x_nd_train, x_nd_test, y_nd_train, y_nd_test = train_test_split(x_nd, y, test_size=0.3, stratify=y, random_state=seed)
xn_train, xn_test, yn_train, yn_test = train_test_split(x_numeric, y, test_size=0.3, stratify=y, random_state=seed)

######
#LDA

#Run LDA for classification
#Note if n_components=None, then all of them are kept
lda = LinearDiscriminantAnalysis(n_components=None, solver='svd')
lda.fit(x_train, y_train)
print(lda.get_params())
print('Priors:', lda.priors_)       #Class prior probabilities
print('Classification Accuracy:', lda.score(x_train, y_train))

#Explore the percentage of between class variance explained by each linear discriminant
print('Explained variance:', lda.explained_variance_ratio_)

######
#Evaluating the model on new data

#Make income predictions for validation set
post_lda = lda.predict(x_test)
post_lda = post_lda.reshape(post_lda.shape[0], 1)
print('Classification Accuracy:', lda.score(x_test, y_test))

#Confusion matrix
Exemple #7
0
#ETAPA XX: APLICAÇÃO DO LDA

#Bloco 01: Aplicação do LDA 

#Variance Caused by Each of the Principal Components

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA()

lda.get_params().keys()
 
X_train_lda_new = lda.fit_transform(X_train_lda, Y_train)
X_test_lda_new = lda.transform(X_test_lda)

print('Original Number of Features:', X_train_lda.shape[1])
print('Reduced Number of Features:', X_train_lda_new.shape[1])

print('Original Number of Features:', X_test_lda.shape[1])
print('Reduced Number of Features:', X_test_lda_new.shape[1])

from pprint import pprint
print('Parameters Currently In Use:\n')
pprint(lda.get_params())

explained_variance_lda = lda.explained_variance_ratio_ 

for i in explained_variance_pca:
    print(format(i*100, 'f'))

plt.figure(1, figsize = (14, 7))
triples = np.zeros((len(scaled_df_x), len(scaled_df_x), 3))
triple_array = []
labeled_array = []
for i in range(len(scaled_df_x)):
    for j in range(len(scaled_df_x)):
        triples[i, j, 0] = scaled_df_x[i][j]
        triples[i, j, 1] = scaled_df_y[i][j]
        triples[i, j, 2] = scaled_df_z[i][j]
        triple_array.append(
            [scaled_df_x[i][j], scaled_df_y[i][j], scaled_df_z[i][j]])
        labeled_array.append(labeled_mat[i][j])
clf = LinearDiscriminantAnalysis(store_covariance=True)
clf.fit(triple_array, labeled_array)
plt.figure()
score = clf.score(triple_array, labeled_array)
params = clf.get_params()
print "Accuracy:", score
print "coef:", clf.coef_
print "Covariance matrix:", clf.covariance_
print "Explained Variance Ratio:", clf.explained_variance_ratio_
print "Means:", clf.means_
print params

pair_set = set()
for i in range(len(clf.coef_)):
    for j in range(len(clf.coef_)):
        if (i, j) not in pair_set and (j, i) not in pair_set and i != j:
            pair_set.add((i, j))
            x_difference = (clf.coef_[i][0] - clf.coef_[j][0])**2
            y_difference = (clf.coef_[i][1] - clf.coef_[j][1])**2
            z_difference = (clf.coef_[i][2] - clf.coef_[j][2])**2
Exemple #9
0
    recall = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=kfold,
                             scoring='recall').mean()
    f1_score = cross_val_score(model, X, y, cv=kfold,
                               scoring='f1_weighted').mean()
    auc_score = cross_val_score(model, X, y, cv=kfold,
                                scoring='roc_auc').mean()

    delta = time.time() - start_time
    print('{}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f} secs'.format(
        name, accuracy, precision, recall, f1_score, auc_score, delta))

clf = LinearDiscriminantAnalysis(store_covariance=True).fit(X_train, y_train)
print(clf.get_params(deep=True))
print(X_train.head())
X2 = clf.transform(X)
print(X2.head())
"""
with open(input_file, "r") as fr:
    n = int(fr.readline())
    for i in range(0, n):
        a, b = map(int, fr.readline().strip().split())
        print(a, b)

output_file = "D:\\Container\\Python Projects\\2020-09-30 Yandex ML Contest\\output_A.txt"
if (yandex): output_file = "output.txt"

with open(output_file, "w") as fw:
    fw.write(str.format("{0:.6f}", a))
      my_ground_truth == my_pred)

# In[103]:

#Let's use Sklearn to see if our solution is correct
#Using sklearn
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
clf.fit(new_data[:, 1:], labels)
LinearDiscriminantAnalysis(n_components=None,
                           priors=None,
                           shrinkage=None,
                           solver='eigen',
                           store_covariance=False,
                           tol=0.0001)
print(clf.get_params())
predictions = clf.predict(new_data[:, 1:])
print(predictions)
errors = sum(labels != predictions)
error_rate = (n_errors / len(predictions) * 100)
print("Error Rate is: ", error_rate, "%")
print("\nAs can be seen, our solution is right!")

# 2. Consider the Logistic Regression as discussed in the class. Assume now that the cost of erring an observation from class 1 is cost1 and the cost of erring observations from class 0 is cost0. How would you modify the goal function, gradient and hessian matrix (slides 11 and 12 in week 5)?
#
# Change the code provided (or developed by you) in the class to receive as input the vector of costs. Test your code with the following script:
#
# trainC1 = mvnrnd([21 21], [1 0; 0 1], 1000);
#
# trainC0 = mvnrnd([23 23], [1 0; 0 1], 20);
#
Exemple #11
0
class LDA(object):
    def __init__(self,
                 solver="svd",
                 shrinkage=None,
                 priors=None,
                 n_components=None,
                 store_covariance=False,
                 tol=1e-4):
        """
        :param solver: string, 可选项,"svd","lsqr", "eigen"。 默认使用svd, 不计算协方差矩阵,适用于大量特征
        的数据, 最小二乘 lsqr, 结合shrinkage 使用。 eigen 特征值分解, 集合shrinkage  使用
        :param shrinkage: str/float 可选项,概率值,默认为None, "auto", 自动收缩, 0到1内的float, 固定的收缩参数
        :param priors: array, optional, shape (n_classes,) 分类优先
        :param n_components:  # 分量数, 默认None, int, 可选项
        :param store_covariance:  bool, 可选项, 只用于”svd“ 额外计算分类协方差矩阵
        :param tol: 浮点型,默认1e-4, 在svd 中,用于排序评估的阈值
        """
        self.model = LinearDiscriminantAnalysis(
            solver=solver,
            shrinkage=shrinkage,
            priors=priors,
            n_components=n_components,
            store_covariance=store_covariance,
            tol=tol)

    def fit(self, x, y):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self, deep=True):
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        self.model.set_params(**params)

    def decision_function(self, x):
        self.model.decision_function(X=x)

    def predict(self, x):
        return self.model.predict(X=x)

    def predict_log_proba(self, x):
        return self.model.predict_log_proba(X=x)

    def predict_proba(self, x):
        return self.model.predict_proba(X=x)

    def score(self, x, y, sample_weight):
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def get_attributes(self):  # 生成模型之后才能获取相关属性值
        coef = self.model.coef_  # 权重向量,
        intercept = self.model.intercept_  # 截距项
        covariance = self.model.covariance_  # 协方差矩阵
        explained_variance_ratio = self.model.explained_variance_ratio_
        means = self.model.means_
        priors = self.model.priors_  # 分类等级, 求和为1 shape (n_classes)
        scalings = self.model.scalings_  # shape(rank,n_classes-1). 缩放
        xbar = self.model.xbar_  # 所有的均值
        classes = self.model.classes_  # 分类标签

        return coef, intercept, covariance, explained_variance_ratio, means, priors, scalings, xbar, classes