Code Example #1
def testSelectedFeatures1():
    print("start testAdditionlSquaredFeatures()")
    LRModel = lr.LogisticRegression(0.001, 500)
    LDAModel = LDA.LDA()
    data1 = genRWNormalized()
    data2 = np.append(data1[:, [10, 1, 9, 6]],
                      np.array([data1[:, -1]]).T,
                      axis=1)
    data3 = addSquareFeature(data1, [10, 1, 9, 6])
    a1 = 0
    b1 = 0
    a2 = 0
    b2 = 0
    a3 = 0
    b3 = 0
    for i in range(3):
        np.random.shuffle(data1)
        np.random.shuffle(data2)
        np.random.shuffle(data3)
        a1 += LRKFoldValidation(LRModel, data1, 5)
        b1 += LDAKFoldValidation(LDAModel, data1, 5)
        a2 += LRKFoldValidation(LRModel, data2, 5)
        b2 += LDAKFoldValidation(LDAModel, data2, 5)
        a3 += LRKFoldValidation(LRModel, data3, 5)
        b3 += LDAKFoldValidation(LDAModel, data3, 5)
    print("Accuracy for lr in rw is {}".format(a1 / 3))
    print("Accuracy for LDA in rw is {}".format(b1 / 3))
    print("Accuracy for lr in rw is {}".format(a2 / 3))
    print("Accuracy for LDA in rw is {}".format(b2 / 3))
    print("Accuracy for lr in rw is {}".format(a3 / 3))
    print("Accuracy for LDA in rw is {}".format(b3 / 3))
Code Example #2
def main():
    filename = '../resource/train.csv'
    itemid, numattr, cateattr, label = readfile(filename)
    totalnum = len(numattr)
    testnum = int(totalnum * 0.1)
    trainnum = totalnum - testnum
    trainnumattr = numattr[0: trainnum]
    traincateattr = cateattr[0: trainnum]
    trainlabel = label[0: trainnum]
    testnumattr = numattr[trainnum:]
    testcateattr = cateattr[trainnum:]
    testlabel = label[trainnum:]
    multidim = MultiDimension(traincateattr)
    trainextattr = multidim.gettrainextattr()
    testextattr = multidim.gettestextattr(testcateattr)
    trainattr = append(trainnumattr, trainextattr, axis=1)
    testattr = append(testnumattr, testextattr, axis=1)
    LDAcoe = LDA(trainattr, trainlabel)
    LDAtrainattr = conpress(trainattr, LDAcoe)
    LDAtestattr = conpress(testattr, LDAcoe)
    # Debug output: inspect the first 20 projected training rows.
    for i in range(20):
        print(LDAtrainattr[i])
    model = WeightedModel(LDAtrainattr, trainlabel)
    right = 0
    for i in range(testnum):
        p = model.predict(LDAtestattr[i])
        if p == testlabel[i]:
            right += 1
    accuracy = float(right) / testnum
    print('accuracy:', accuracy)
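MultiDimension is a project class (not shown) that expands the categorical attributes into extra numeric columns. A minimal sketch of a one-hot expander with the same two-method interface, assuming rows are lists of category values (the class name OneHotExpander is illustrative):

class OneHotExpander:
    def __init__(self, train_cate_attr):
        self.train_cate_attr = train_cate_attr
        # Remember the category values seen per column during training.
        self.categories = [sorted({row[j] for row in train_cate_attr})
                           for j in range(len(train_cate_attr[0]))]

    def _expand(self, rows):
        # One indicator column per (attribute, category) pair;
        # categories unseen in training encode as all zeros.
        return [[1 if row[j] == c else 0
                 for j, cats in enumerate(self.categories) for c in cats]
                for row in rows]

    def gettrainextattr(self):
        return self._expand(self.train_cate_attr)

    def gettestextattr(self, test_cate_attr):
        return self._expand(test_cate_attr)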
Code Example #3
def three():
    app = Flask(__name__)
    app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True

    result = LDA.LDA(10)  # run LDA over 10 documents
    return json.dumps(result, ensure_ascii=False)
Code Example #4
    def __get_topic(self):
        """
            returns a dictionary of (hashtag: topic) attributes using Latent Dirichlet Allocation
        """
        tweet_topic = {}
        tweet_data = []
        for tweet in self.tweets:
            tweet_topic[tweet["id_str"]] = ""

            text = self.get_tweet_text(tweet)
            tweet_data.append((text, tweet["id_str"]))

        lda = LDA.LDA(tweet_data)

        for tweet in self.tweets:
            text = self.get_tweet_text(tweet)
            tweet_topic[tweet["id_str"]] = lda.predict_with_bag(text)
            # tweet_topic[tweet["id_str"]] = lda.predict_with_tf_idf(text)

        hashtag_topic = {}
        for hashtag in self.hashtags:
            hashtag_topic[hashtag["text"]] = []

        for tweet, hashtagList in self.tweet_hashtag_map.items():
            for hashtag in hashtagList:
                hashtag_topic[hashtag["text"]].append(tweet_topic[tweet])

        # A hashtag's topic is the majority topic among its tweets
        hashtag_topic = {hashtag: self.most_common(l)
                         for hashtag, l in hashtag_topic.items()}
        return hashtag_topic
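The helper self.most_common is referenced above but not defined in this example. A plausible sketch using collections.Counter, assuming it returns the most frequent topic in the list (ties broken arbitrarily, None for an empty list):

    def most_common(self, topics):
        # Majority vote over a hashtag's tweet topics.
        from collections import Counter
        if not topics:
            return None
        return Counter(topics).most_common(1)[0][0]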
Code Example #5
def testLDAWithWine():
    data = genDataWOHeader(file_path1)
    qualityToCategory(data)
    np.random.shuffle(data)
    #data1= removeOutLiersByND(data2)
    testSet, trainSet = seperateTestSet(data)
    aModel = LDA.LDA()
    return LDAKFoldValidation(aModel, trainSet, 5)
Code Example #6
def testLDAWithCancer():
    data = genData(file_path2)
    classToCategory(data)
    preprocessData(data)
    np.random.shuffle(data)
    #data1= removeOutLiersByND(data2)
    testSet, trainSet = seperateTestSet(data)
    aModel = LDA.LDA()
    return LDAKFoldValidation(aModel, trainSet, 5)
Code Example #7
def RunTrainLDA(infile, pcaFile, ldaFile):

    import pickle

    # The dataset and subject IDs were serialized back-to-back into one file.
    with open(infile, "rb") as fp:
        dataset = pickle.load(fp)
        subjID = pickle.load(fp)

    pca = PCA(dataset)
    pca_proj = pca.compute()

    np.save(pcaFile, pca_proj)

    lda = LDA(dataset, subjID, pca_proj)
    projData = lda.projectData()
    lda_proj = lda.train(projData)

    np.save(ldaFile, lda_proj)
Code Example #8
    def _init_trans_mat(self):
        # Check input
        if any([x is None for x in [self.X, self.labels, self.d]]):
            raise ValueError('X, labels and subdim not set!')

        num_pts = self.X.shape[0]
        D = self.X.shape[1]
        subdim = self.d

        # Setup random state
        prng = RandomState()
        if self._SEED is not None:
            prng = RandomState(self._SEED)
            if self._verbose:
                print("Setting random seed to", self._SEED)

        if self._init_method == "PCA":
            if num_pts < self.d:
                raise ValueError('num_pts < subdim')
            if self.d > D:
                raise ValueError('subdim > inputdim')

            pca = PCA(n_components=subdim, whiten=False)
            pca.fit(self.X)
            L = pca.components_.T + 1E-6

        elif self._init_method == "LDA":
            if self.d > D:
                raise ValueError('subdim > inputdim')

            lda_obj = LDA.LDA(self.X, self.labels)
            lda_obj.compute(dim=self.d)
            L = lda_obj.getTransform()
            L = L * (1. / LA.norm(L, ord=1, axis=1)).reshape(-1, 1)
        elif self._init_method == "randbeng":
            # L = 1. * bound * prng.rand(D, self.d) - bound
            # Use the seeded generator and the local input dim D (not self.D).
            L = prng.normal(0,
                            np.sqrt(2) / np.sqrt(D + self.d),
                            (D, self.d))
        elif self._init_method == "randbest":
            # Generate several random matrices and keep the one with the fewest active constraints
            if self._verbose:
                print('Doing random pre-gen L')
            t0 = timeit.default_timer()
            best_L = prng.rand(D, self.d)
            L = best_L
            self.loss_fun(best_L)
            # nconsts = self._count_active_constraints()
            bound = np.sqrt(6. / (D + self.d))
            best_N_consts = 1E10
            for i in range(0, 10):
                L = 1. * bound * prng.rand(D, self.d) - bound
                # L = 1E-5*prng.rand(D,self.d)
                # L = L * (1./LA.norm(L,ord=1,axiss=1)).reshape(-1,1)
                self.loss_fun(L)
                consts = self._count_active_constraints()
                if consts < best_N_consts:
                    best_N_consts = consts
                    best_L = copy.copy(L)
            L = copy.copy(best_L)
            if self._verbose:
                print("Pre-gen of L done. Took:",
                      "%3.3f" % (timeit.default_timer() - t0),
                      end=", ")
                print("# active const", best_N_consts, end=", ")

        elif self._init_method == "rand":
            # Random init drawn with the Glorot/Xavier uniform bound
            bound = np.sqrt(6. / (D + self.d))
            L = 1. * bound * prng.rand(D, self.d) - bound

        return L
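The bound np.sqrt(6. / (D + self.d)) used by the rand branches is the Glorot/Xavier uniform bound. Note that bound * prng.rand(D, d) - bound draws from [-bound, 0); a symmetric draw over [-bound, bound) scales by two, as in this standalone sketch (the function name is illustrative):

import numpy as np

def glorot_uniform(D, d, rng=np.random):
    # Symmetric Glorot/Xavier uniform draw over [-bound, bound).
    bound = np.sqrt(6.0 / (D + d))
    return 2.0 * bound * rng.rand(D, d) - bound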
Code Example #9
    def classify(self, X):
        # Assign each observation to the class with the largest posterior.
        y_n = np.empty(len(X))
        for i, x_n in enumerate(X):
            x_n = x_n.reshape(-1, 1)
            p_ks = np.empty(len(self.unique_y))
        
            for j, k in enumerate(self.unique_y):
                p_x_given_y = self._mvn_density(x_n, self.mu_ks[j], self.Sigma)
                p_y_given_x = self.pi_ks[j]*p_x_given_y
                p_ks[j] = p_y_given_x
            
            y_n[i] = self.unique_y[np.argmax(p_ks)]
        
        return y_n
            

We fit the LDA model below and classify the training observations. As the output shows, we have 100% training accuracy.

lda = LDA()
lda.fit(X, y)
yhat = lda.classify(X)
np.mean(yhat == y)

The function below visualizes class predictions based on the input values for a model with $\mathbf{x}_n \in \mathbb{R}^2$. To apply this function, we build a model with only two columns from the `wine` dataset. We see that the decision boundaries are linear, as we expect from LDA.

def graph_boundaries(X, model, model_title, n0 = 100, n1 = 100, figsize = (7, 5), label_every = 4):

    # Generate a grid of X values for plotting
    d0_range = np.linspace(X[:,0].min(), X[:,0].max(), n0)
    d1_range = np.linspace(X[:,1].min(), X[:,1].max(), n1)
    X_plot = np.array(np.meshgrid(d0_range, d1_range)).T.reshape(-1, 2)

    # Get class predictions
    y_plot = model.classify(X_plot).astype(int)
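    # NOTE: the original plotting code is truncated here. A plausible
    # sketch of the remaining step, assuming matplotlib is imported as plt:
    # color the prediction grid by class to expose the linear boundaries.
    fig, ax = plt.subplots(figsize=figsize)
    ax.scatter(X_plot[:, 0], X_plot[:, 1], c=y_plot, alpha=0.3, s=8)
    ax.set_title(model_title)
    plt.show()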
Code Example #10
File: test.py Project: jrobertson5151/Jeopardy
import os
import pickle

from LDA import *
path = 'bbcsport'

docs = []
for (dirpath, dirnames, filenames) in os.walk(path):
    for f_name in filenames:
        file_path = os.path.join(dirpath, f_name)
        with open(file_path, 'r', encoding='latin-1') as txt_file:
            print(file_path)
            data = txt_file.read().replace('\n', ' ')
            docs.append(data)

l = LDA(docs, K=5)
l.train(n_iterations=100)
l.pickle_LDA('pickledlda')
Code Example #11
File: LDAmain.py Project: tanabe333/MLBook
# Minimum number of samples across the two categories
minNum = np.min([np.sum(Ytr==-1),np.sum(Ytr==1)])

# Samples in each category
Xneg = Xtr[Ytr[:,0]==-1]
Xpos = Xtr[Ytr[:,0]==1]

# Take minNum samples from each category and concatenate
Xtr = np.concatenate([Xneg[:minNum],Xpos[:minNum]],axis=0)
Ytr = np.concatenate([-1*np.ones(shape=[minNum,1]),1*np.ones(shape=[minNum,1])])
#-------------------
'''

#-------------------
# 3. Train the linear discriminant model
myModel = lda.LDA(Xtr, Ytr)
myModel.train()
#-------------------

#-------------------
# 4. Evaluate the linear discriminant model
print(f"モデルパラメータ:\nw={myModel.w},\n平均m={myModel.m}")
print(f"正解率={myModel.accuracy(Xte,Yte):.2f}")
#-------------------

#-------------------
# 5. Plot the ground truth and predictions
if Xtr.shape[1] == 2:
    myModel.plotModel2D(
        X=Xtr,
        Y=Ytr,
Code Example #12
df = bc_df.copy()
del df['ID'] # Drop the ID column; it carries no information about whether a tumor is benign


df['class_modified'] = (df['Class'] == 4).astype(int)
df['Bare_Nuclei'] = pd.to_numeric(df['Bare_Nuclei']).astype(int)


#Standardize data
for column in df.columns[0:9]:
    df[column] = (df[column] - df[column].mean()) / df[column].std()


#LDA

LDA_BC = LDA()
df.insert(0, "Constant", 1)

df_copy = df.copy()
df_copy = df_copy.drop(columns=['Class'])

X = df_copy[df_copy.columns[0:10]]
Y = df_copy["class_modified"]


def k_fold_CV(data, model, k):

    all_data = data.iloc[np.random.permutation(len(data))]
    data_split = np.array_split(all_data, k)
    accuracies = np.ones(k)
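    # NOTE: the snippet is truncated here. A plausible sketch of the
    # remaining loop, assuming the model exposes fit(X, y) and predict(X)
    # and that "class_modified" is the label column:
    for i in range(k):
        test = data_split[i]
        train = pd.concat(data_split[:i] + data_split[i + 1:])
        model.fit(train.drop(columns=['class_modified']), train['class_modified'])
        preds = model.predict(test.drop(columns=['class_modified']))
        accuracies[i] = np.mean(preds == test['class_modified'])
    return accuracies.mean()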
Code Example #13
# clf.fit(X_wines[:int(0.7*len(X_wines))], y_wines[:int(0.7*len(X_wines))])
# predicted_y = clf.predict(X_wines[int(0.7*len(X_wines)):])
# print(evaluate_acc(predicted_y,y_wines[int(0.7*len(X_wines)):]))

# X_wines, y_wines = process_wines()
# clf = LDA()
# print("LDA on wines- Zachary",cross_validation(clf,X_wines,y_wines,5))
#
# X_tumors, y_tumors = process_tumors()
# clf = LDA()
# print("LDA on tumors - Zachary",cross_validation(clf,X_tumors,y_tumors,5))
#

X_wines, y_wines = process_wines()
start = time.time()
clf = LDA(X_wines[:int(0.8 * len(X_wines))])
print("LDA on wines", cross_validation(clf, X_wines, y_wines, 5))
end = time.time()
print("LDA on wines time", (end - start) / 5)

X_tumors, y_tumors = process_tumors()
start = time.time()
clf = LDA(X_tumors[:int(0.8 * len(X_tumors))])
print("LDA on tumors", cross_validation(clf, X_tumors, y_tumors, 5))
end = time.time()
print("LDA on tumors time", (end - start) / 5)
X_wines, y_wines = process_wines()
start = time.time()
clf = Logistic(0.01, 1000)
Code Example #14
    # y_pred = LDA.knn(breast_lower_dimension_train, breast_train_y.values.ravel(), breast_lower_dimension_test)
    # acc = LDA.compute_accuracy(y_pred, breast_test_y.values.ravel())

    # print("=================== IONOSPHERE ==============")
    # ionosphere_train_x_selection, ionosphere_test_x_selection = featureSelection(ionosphere_train_x.values, ionosphere_train_y.values.ravel(), ionosphere_test_x.values)
    # ionosphere_lower_dimension_train, ionosphere_lower_dimension_test = LDA.LDA(ionosphere_train_x_selection, ionosphere_train_y.values.ravel(), ionosphere_test_x_selection, ionosphere_test_y.values.ravel(), 'ionosphere')
    # prior, train_mean, train_cov = NBC.train(ionosphere_train_x.values, ionosphere_train_y.values.ravel(), CLASS_NUM)
    # acc = NBC.test(ionosphere_test_x.values, ionosphere_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'ionosphere', 'NBC', True)
    # # # Project to lower dimension
    # # crossValidation(iris_lower_dimension_train, iris_train_y, CLASS_NUM, 'iris', 'NBC', K)        # BUG
    # prior, train_mean, train_cov = NBC.train(ionosphere_lower_dimension_train, ionosphere_train_y.values.ravel(), CLASS_NUM)
    # acc = NBC.test(ionosphere_lower_dimension_test, ionosphere_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'ionosphere_lower', 'NBC', True)

    print("=================== WINE ==============")
    wine_train_x_selection, wine_test_x_selection = featureSelection(wine_train_x.values, wine_train_y.values.ravel(), wine_test_x.values)
    wine_lower_dimension_train, wine_lower_dimension_test = LDA.LDA(wine_train_x_selection, wine_train_y.values.ravel(), wine_test_x_selection, wine_test_y.values.ravel(), 'wine')
    # prior, train_mean, train_cov = NBC.train(wine_train_x.values, wine_train_y.values.ravel(), CLASS_NUM)
    # acc = NBC.test(wine_test_x.values, wine_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'wine', 'NBC', True)
    # Pocket classifier
    # crossValidation(ionosphere_train_x, ionosphere_train_y, class_num, 'ionosphere', 'PC', K)
    train_weight = PC.train(wine_train_x.values, wine_train_y.values.ravel(), CLASS_NUM)
    acc = PC.test(wine_test_x.values, wine_test_y.values.ravel(), train_weight, CLASS_NUM, 'wine', 'PC', True)
    
    # # Project to lower dimension
    # # crossValidation(iris_lower_dimension_train, iris_train_y, CLASS_NUM, 'iris', 'NBC', K)        # BUG
    # prior, train_mean, train_cov = NBC.train(wine_lower_dimension_train, wine_train_y.values.ravel(), CLASS_NUM)
    # acc = NBC.test(wine_lower_dimension_test, wine_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'wine_lower', 'NBC', True)

    # Pocket classifier
    # crossValidation(ionosphere_train_x, ionosphere_train_y, class_num, 'ionosphere', 'PC', K)
    train_weight = PC.train(wine_lower_dimension_train, wine_train_y.values.ravel(), CLASS_NUM)
Code Example #15
    #### preprocess before LDA
    dict_title_preprocessed = lda.texts_preprocess(dict_title)
    dict_description_preprocessed = lda.texts_preprocess(dict_description)
    list_title_preprocessed = list(dict_title_preprocessed.values())
    list_description_preprocessed = list(
        dict_description_preprocessed.values())
    print("text preprocessed done!")

    #### generate item title and description similarity for selected items
    item_tt_id_lst = list(train_item_id.keys()) + list(test_item_id.keys())
    item_total_id_lst = list(dict_title.keys())
    # Map ids to positions once, avoiding repeated O(n) list.index scans.
    id_to_index = {item_id: idx for idx, item_id in enumerate(item_total_id_lst)}
    index_lst = [id_to_index[item_id] for item_id in item_tt_id_lst]
    title_similarity = lda.LDA(texts=list_title_preprocessed,
                               index_lst=index_lst,
                               num_topics=finput_topic_num)
    description_similarity = lda.LDA(texts=list_description_preprocessed,
                                     index_lst=index_lst,
                                     num_topics=finput_topic_num)
    print("lda similarity calculated done!")

    #### generate train/test item similarity matrix
    df_title_similarity_matrix = pd.DataFrame(np.array(title_similarity),
                                              index=item_tt_id_lst,
                                              columns=item_tt_id_lst)
    df_description_similarity_matrix = pd.DataFrame(
        np.array(description_similarity),
        index=item_tt_id_lst,
        columns=item_tt_id_lst)
    # train_item_id = rw.readffile(finput_train_item_id)
Code Example #16
import Potential
import PhysTools
import LDA
import PlotResults


if __name__ == '__main__':
    temp = 40 * 1e-9  # temperature in kelvin
    lattice_er = 6  # lattice depth in recoil energies
    green_er = 0.1  # 532 nm green box-trap depth
    m = 350  # magnetic flux density
    scattlength = PhysTools.scatlength(m)
    atom = PhysTools.Lithium(a=scattlength)
    boxtrap = Potential.BoxTrap(SheetEr=green_er,
                                HoleEr=green_er,
                                SheetD=30,
                                HoleD=30,
                                atom=atom)
    lattices = Potential.TopHatLattices(
        Er=[lattice_er, lattice_er, lattice_er], atom=atom)
    mfield = Potential.BiasMagneticField(B=350, curv_I=0.134363)
    lda0 = LDA.LDA(lattices=lattices,
                   boxtrap=boxtrap,
                   mfield=mfield,
                   Global_mu=-2)
    PlotResults.plotresults(lda0, 0.6, m)
Code Example #17
np.random.seed(1234)

# GENERATE DATA
# words take integer values 0 - 4

print('generate sample...')

N = [1000, 1000, 1000]

theta = np.array([[0.9, 0.05, 0.05], [0.1, 0.7, 0.2], [0.1, 0.2, 0.7]])

beta = np.array([[0, 0.3, 0, 0.6, 0.1], [0.8, 0.05, 0.05, 0.05, 0.05],
                 [0.05, 0.05, 0.5, 0, 0.4]])

print('theta')
print(theta)
print('beta')
print(beta)
print()

Y = []

for i in range(3):
    yi = np.zeros(N[i], dtype=int)
    for j in range(N[i]):
        topic = np.random.choice(3, p=theta[i, :])
        yi[j] = np.random.choice(5, p=beta[topic, :])
    Y.append(yi)

LDA.LDA(Y, 3, 3, 5)
Code Example #18
docs_bus = list(reviews_merged_bus.values())  # materialize: dict views cannot be pickled

with open('../output/reviews_merged_bus.pickle', 'wb') as f:
    pickle.dump(reviews_merged_bus, f)

with open('../output/docs_bars_bus.pickle', 'wb') as f:
    pickle.dump(docs_bus, f)

with open('../output/bus_ids_bars_LDA.pickle', 'wb') as f:
    pickle.dump(list(reviews_merged_bus.keys()), f)

lda_bus = LDA.LDA(
    alpha=alpha,
    eta=eta,
    n_topics=n_topics,
    n_features=n_features,
    max_df=max_df,
    min_df=min_df,
    max_iter=max_iter,
)
lda_bus.vectorizecounts(docs_bus)
lda_bus.fitLDA()
LDA.SaveLDAModel('../output/LDA_model_bus.pickle', lda_bus)

# The topic vector for a given business is given by this dataframe.
bus_lda_ids = pickle.load(open('../output/bus_ids_bars_LDA.pickle', 'rb'))
bus_vectors = pd.DataFrame()
bus_vectors['business_id'] = bus_lda_ids
transformed = lda_bus.lda.transform(lda_bus.tf)

bus_vectors['topic_vector'] = [bus_topic_vec for bus_topic_vec in transformed]
Code Example #19
import SVM
import lr
import Bayes
import LDA
LDA.LDA()
Bayes.Bayes()
SVM.svmwch()
lr.lr()
Code Example #20
def perform_lda(train_dataset, train_labelset, test_dataset):
    lda = LDA.LDA(train_dataset, train_labelset)
    projection_matrix, projected_train_data = lda.fit()
    print(np.shape(projection_matrix), np.shape(test_dataset))
    projected_test_data = lda.test_fit(projection_matrix, test_dataset)
    return projected_train_data, projected_test_data
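A hedged usage sketch: feed the LDA-projected features to a downstream classifier. The dataset variables and the scikit-learn classifier here are illustrative, not part of the original:

from sklearn.neighbors import KNeighborsClassifier

# train_x, train_y, test_x, test_y are assumed raw splits.
train_proj, test_proj = perform_lda(train_x, train_y, test_x)
clf = KNeighborsClassifier(n_neighbors=3).fit(train_proj, train_y)
print('accuracy after LDA projection:', clf.score(test_proj, test_y))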
Code Example #21
import time
import BCWDataset, WQDataset
import LDA, LogisticRegression
import KFoldCrossValidator

bcwd = BCWDataset.BCWDataset()
bcwd.load()
wqd = WQDataset.WQDataset()
wqd.load()

print("LDA, BCW")
print(KFoldCrossValidator.validate(LDA.LDA(), 5, bcwd.X, bcwd.y))
print("LogReg, BCW")
print(
    KFoldCrossValidator.validate(
        LogisticRegression.LogisticRegression(flr=0.6, slr=0.1, num_it=100), 5,
        bcwd.X, bcwd.y))
print("LDA, WQ")
print(KFoldCrossValidator.validate(LDA.LDA(), 5, wqd.X, wqd.y))
print("LogReg, WQ")
print(
    KFoldCrossValidator.validate(
        LogisticRegression.LogisticRegression(flr=0.6, slr=0.1, num_it=100), 5,
        wqd.X, wqd.y))
Code Example #22
import matplotlib.pyplot as plt

import LDA


def printAttr(lda):
    print('K', lda.K)
    print('_uniqTermSet', lda._uniqTermSet)
    print('docsSize', lda._docNum)
    print('termSize', lda._termNum)
    print('Z ini:', lda.Z)
    print('docTopic ini', lda._docTopic)  # 4 docs, 2 topics
    print('lda.termTopic', lda._termTopic)
    print('lda.Phi', lda.Phi)
    print('lda.Theta', lda.Theta)


if __name__ == "__main__":
    corpus = [
        "With all of the critical success Downey had experienced throughout his career, he had not appeared in a blockbuster film. That changed in 2008 when Downey starred in two critically and commercially successful films, Iron Man and Tropic Thunder. In the article Ben Stiller wrote for Downey's entry in the 2008 edition of The Time 100, he offered an observation on Downey's commercially successful summer at the box office.",
        "On June 14, 2010, Downey and his wife Susan opened their own production company called Team Downey. Their first project was The Judge.",
        "Robert John Downey Jr. is an American actor, producer, and singer. His career has been characterized by critical and popular success in his youth, followed by a period of substance abuse and legal troubles, before a resurgence of commercial success in middle age.",
        "In 2008, Downey was named by Time magazine among the 100 most influential people in the world, and from 2013 to 2015, he was listed by Forbes as Hollywood's highest-paid actor. His films have grossed over $14.4 billion worldwide, making him the second highest-grossing box-office star of all time."
    ]

    X = [i.split(' ') for i in corpus]
    lda = LDA.LDA()
    lda.fit(X)

    printAttr(lda)

    #fig,ax= lda.plotDocTopicDist(2)

    #fig,ax = lda.plotTermTopicDist(2)

    #fig,ax = lda.plotTopicTermDist(1)
    plt.show()
Code Example #23
File: hw7.py Project: Dada870423/ML
sample_image = test_images[random.sample(range(len(test_label)), 10)]

if input_.mode == 0:
    ## Doing PCA and get the eigenface and W(dimension reduction)
    PCA_mean, PCA_EigenFace, PCA_W = PCA(images=images,
                                         Size=Size,
                                         FacePath="./PCA/EigenFace/")
    Reconstruct(EigenFace=PCA_EigenFace,
                sample_image=sample_image,
                Size=Size,
                Path="./PCA/")

    ## Doing LDA and get the fisherface and W(dimension reduction)
    LDA_mean, LDA_EigenFace, LDA_W = LDA(images=images,
                                         Size=Size,
                                         label=label,
                                         FacePath="./LDA/EigenFace/")
    Reconstruct(EigenFace=LDA_EigenFace,
                sample_image=sample_image,
                Size=Size,
                Path="./LDA/")

elif input_.mode == 1:
    ## Doing PCA and get the eigenface and W(dimension reduction)
    print("PCA:")
    PCA_mean, PCA_EigenFace, PCA_W = PCA(images=images,
                                         Size=Size,
                                         FacePath=None)
    ## Use k-NN on the PCA projection to label the test images.
    KNN("PCA", k=3, images=images, EigenFace=PCA_EigenFace.T,
        proj_train_image=PCA_W, label=label,
        test_images=test_images, test_label=test_label)
Code Example #24
import LoadImage
import LBP
import LDA
ClassNum = 40
countInSameClass = 10
image_total = ClassNum * countInSameClass
sizeOfImage = 112 * 92
if __name__ == '__main__':
    FaceMat, label = LoadImage.loadImage('./ORL/s', ClassNum, countInSameClass,
                                         image_total, sizeOfImage)
    # FaceMat_fromLBP = LBP.LBP(92,112,FaceMat)
    LDA.LDA(FaceMat.T, label)
Code Example #25
ax.plot(X_pca_projected[30:],
        np.zeros(30),
        linestyle='None',
        marker='o',
        markersize=7,
        color='blue')
ax.set_xlabel('PC1')
ax.set_ylabel('')
ax.set_title('Projection of X onto PC1')
fig.show()
fig.savefig('Projection of X onto PC1')

X.shape[0]

y = np.array(y)
W = lda.LDA(X, y)
set(y)

X_Wproj = X.dot(W)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(2, 1, 1)
ax.plot(X_Wproj[0:30],
        np.zeros(30),
        linestyle='None',
        marker='o',
        markersize=5,
        color='orange')
ax.plot(X_Wproj[30:],
        np.zeros(30),
        linestyle='None',
Code Example #26
    for i in learn:
        for j in ite:
            LRModel = lr.LogisticRegression(i, j)
            ave = 0.0
            for k in range(3):
                ac = LRKFoldValidation(LRModel, rwClear, 5)
                print("per k fold:", ac)
                ave += ac
            ave = ave / 3.0
            print("ave:", ave)
            if ave > max_acc:
                max_acc = ave
                bestLearn = i
                bestIte = j
            print(ave, " ", i, " ", j)
    print(bestLearn)
    print(bestIte)
    print(max_acc)


LRModel = lr.LogisticRegression(0.001, 500)
LDAModel = LDA.LDA()
rwNormalized = genRWNormalized()
cancerNormalized = genCancerNormalized()
print(LRKFoldValidation(LRModel, cancerNormalized, 5))
print(LDAKFoldValidation(LDAModel, cancerNormalized, 5))
print(LRKFoldValidation(LRModel, rwNormalized, 5))
print(LDAKFoldValidation(LDAModel, rwNormalized, 5))
Code Example #27
def featureSelection(data, isLR):
    selectedFeatureNum = []
    selectedFeatureArray = -1
    bestAccuracyAll = 0
    y_2d = np.array([data[:, -1]]).T
    #print(y_2d)
    for i in range(data.shape[1] - 1):
        featureToAdd = -1
        bestAccuracy = 0
        column_2d = -1
        print("select feature{}".format(i))
        if i == 0:
            for j in range(data.shape[1] - 1):
                if j not in selectedFeatureNum:
                    column_2d = np.array([data[:, j]]).T
                    nums = selectedFeatureNum + [j]

                    # ------5 should be changed --
                    #print(np.concatenate((column_2d,y_2d), axis = 1))
                    if isLR:
                        model = lr.LogisticRegression(0.001, 500)
                        accuracy = LRKFoldValidation(
                            model, np.concatenate((column_2d, y_2d), axis=1),
                            5)
                    else:
                        model = LDA.LDA()
                        accuracy = LDAKFoldValidation(
                            model, np.concatenate((column_2d, y_2d), axis=1),
                            5)

                    print("Using feature(s){} accuracy is{}".format(
                        nums, accuracy))
                    if accuracy >= bestAccuracy:
                        bestAccuracy = accuracy
                        featureToAdd = j
            selectedFeatureArray = column_2d
            bestAccuracyAll = bestAccuracy
            selectedFeatureNum.append(featureToAdd)
            continue
        else:
            #try add feature from the rest of set
            for j in range(data.shape[1] - 1):
                if j not in selectedFeatureNum:
                    column_2d = np.array([data[:, j]]).T
                    nums = selectedFeatureNum + [j]

                    # ------5 should be changed ---
                    #print(np.concatenate((selectedFeatureArray, column_2d , y_2d), axis = 1))
                    if isLR:
                        model = lr.LogisticRegression(0.001, 500)
                        accuracy = LRKFoldValidation(
                            model,
                            np.concatenate(
                                (selectedFeatureArray, column_2d, y_2d),
                                axis=1), 5)
                    else:
                        model = LDA.LDA()
                        accuracy = LDAKFoldValidation(
                            model,
                            np.concatenate(
                                (selectedFeatureArray, column_2d, y_2d),
                                axis=1), 5)
                    print("Using feature(s){} accuracy is{}".format(
                        nums, accuracy))
                    if accuracy >= bestAccuracy:
                        bestAccuracy = accuracy
                        featureToAdd = j

        # stop when the additional feature no longer improves accuracy
        if bestAccuracyAll >= bestAccuracy:
            print("maxima reached")
            break
        else:
            # add the additional feature
            bestAccuracyAll = bestAccuracy
            selectedFeatureNum.append(featureToAdd)
            selectedFeatureArray = np.concatenate(
                (selectedFeatureArray, np.array([data[:, featureToAdd]]).T),
                axis=1)
    print(
        "feature selection ended, best performing features are {}, the accuracy is {}"
        .format(selectedFeatureNum, bestAccuracyAll))
    return selectedFeatureNum, selectedFeatureArray
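A brief usage sketch, assuming data is a NumPy array whose last column holds the labels, as the function expects (variable names illustrative):

# Greedy forward selection scored with logistic regression ...
nums, features = featureSelection(data, isLR=True)
# ... or scored with LDA instead.
nums, features = featureSelection(data, isLR=False)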
Code Example #28
def testDataPreprocess():
    rwData = genRW()
    cancerData = genCancer()
    rwNormalized = genRWNormalized()
    cancerNormalized = genCancerNormalized()
    rwRemovedOL = genRWRemovedOL()
    cancerRemovedOL = genCancerRemovedOL()
    rwClear = genRWClear()
    cancerClear = genCancerClear()
    LRModel = lr.LogisticRegression(0.001, 500)
    LDAModel = LDA.LDA()

    a = 0
    b = 0
    c = 0
    d = 0
    for i in range(3):
        np.random.shuffle(rwData)
        np.random.shuffle(cancerData)
        a += LRKFoldValidation(LRModel, rwData, 5)
        b += LDAKFoldValidation(LDAModel, rwData, 5)
        c += LRKFoldValidation(LRModel, cancerData, 5)
        d += LDAKFoldValidation(LDAModel, cancerData, 5)

    print(a / 3)
    print(b / 3)
    print(c / 3)
    print(d / 3)

    a2 = 0
    b2 = 0
    c2 = 0
    d2 = 0
    for i in range(3):
        np.random.shuffle(rwNormalized)
        np.random.shuffle(cancerNormalized)
        a2 += LRKFoldValidation(LRModel, rwNormalized, 5)
        b2 += LDAKFoldValidation(LDAModel, rwNormalized, 5)
        c2 += LRKFoldValidation(LRModel, cancerNormalized, 5)
        d2 += LDAKFoldValidation(LDAModel, cancerNormalized, 5)
    print(a2 / 3)
    print(b2 / 3)
    print(c2 / 3)
    print(d2 / 3)

    a3 = 0
    b3 = 0
    c3 = 0
    d3 = 0
    for i in range(3):
        np.random.shuffle(rwClear)
        np.random.shuffle(cancerClear)
        a3 += LRKFoldValidation(LRModel, rwClear, 5)
        b3 += LDAKFoldValidation(LDAModel, rwClear, 5)
        c3 += LRKFoldValidation(LRModel, cancerClear, 5)
        d3 += LDAKFoldValidation(LDAModel, cancerClear, 5)
    print(a3 / 3)
    print(b3 / 3)
    print(c3 / 3)
    print(d3 / 3)

    a4 = 0
    b4 = 0
    c4 = 0
    d4 = 0
    for i in range(3):
        np.random.shuffle(rwRemovedOL)
        np.random.shuffle(cancerRemovedOL)
        a4 += LRKFoldValidation(LRModel, rwRemovedOL, 5)
        b4 += LDAKFoldValidation(LDAModel, rwRemovedOL, 5)
        c4 += LRKFoldValidation(LRModel, cancerRemovedOL, 5)
        d4 += LDAKFoldValidation(LDAModel, cancerRemovedOL, 5)
    print(a4 / 3)
    print(b4 / 3)
    print(c4 / 3)
    print(d4 / 3)
Code Example #29
#for userId, user in dic_user.iteritems():
#	print(str(userId) + " " + str(len(user.tweet_set)))

k_topics = num_topics
LDA_iterations = num_iterations
sentimentPoints = getSentimentPoints()
#print(sentimentPoints)

dictionary, corpus, out_set = preprocessing(doc_set)

for i in range(0,len(out_set)):
	tweet_set[i].wordSet = out_set[i]

sentimentsOfTweets = getSentimentScoreOfTweets(out_set)
model = LDA(dictionary, corpus, k_topics, LDA_iterations)

for i in range(0,len(sentimentsOfTweets)):
	tweet_set[i].russell_tuple = sentimentsOfTweets[i]

sentDic = loadDict()

dictByTopic = []
tempDic = {}
topics = model.get_topics()

for topic in topics:
	tempDic = {}
	for i in range(0,len(topic)):
		tempDic[dictionary[i]] = topic[i]
	dictByTopic.append(tempDic)
Code Example #30
plt.colorbar()
plt.show()


# part c
# Reference: https://github.com/scikit-learn/scikit-learn/blob/7389dba/sklearn/discriminant_analysis.py

# First, split the data into training and validation sets
data_size = len(training_data_norm)
indices = np.random.permutation(data_size)
x_val, y_val = training_data_norm[indices][:10000], training_labels[indices][:10000]
x_train, y_train = training_data_norm[indices][10000:], training_labels[indices][10000:]
y_val = y_val.flatten()

nums = [100, 200, 500, 1000, 2000, 5000, 10000, 30000, 50000]
model1, model2 = LDA.LDA(), QDA.QDA()
lda_score, qda_score = [], []
for i in nums:
    model1.fit(x_train[:i], y_train[:i])
    model2.fit(x_train[:i], y_train[:i])
    lda_pred = model1.predict(x_val)
    qda_pred = model2.predict(x_val)
    lda_err = 1 - np.sum(lda_pred == y_val)/y_val.shape[0]
    lda_score.append(lda_err)
    qda_err = 1 - np.sum(qda_pred == y_val)/y_val.shape[0]
    qda_score.append(qda_err)

print(lda_score, qda_score)
plt.plot(nums, lda_score, 'ro', label="LDA")
plt.plot(nums, qda_score, 'yo', label="QDA")
plt.xlabel('numbers of training examples')
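# NOTE: the snippet is truncated here; a plausible completion of the plot:
plt.ylabel('validation error rate')
plt.legend()
plt.show()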