Example #1
def three():
    app = Flask(__name__)
    app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True

    result = LDA.LDA(10)  # run over 10 documents
    # print
    return json.dumps(result, ensure_ascii=False)
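As written, this snippet builds the JSON payload but never registers a route or starts the server. A minimal sketch of how such a view might be wired up in Flask (the route path and the run() call are assumptions, not part of the original project):

import json
from flask import Flask
import LDA

app = Flask(__name__)
app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True

@app.route('/lda')  # route path is an assumption
def three():
    # Run LDA over 10 documents and return JSON that keeps non-ASCII text readable.
    result = LDA.LDA(10)
    return json.dumps(result, ensure_ascii=False)

if __name__ == '__main__':
    app.run()  # default host/port; adjust as needed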
Example #2
def lda_terms_analysis(lda_model_filename, word2vec_model_filename):
    topics = LDA.get_topics_terms(lda_model_filename)
    word2vec = models.Word2Vec.load(word2vec_model_filename)
    new_topics = []
    useless = []
    for topic in topics:
        words = topic[-1]
        dictionary, matrix = get_words_matrix(words, word2vec)
        clusters, centers = cluster(matrix, dictionary, 2, 10)
        cohesions = []
        for c in clusters.items():
            sub_words = c[-1]
            label = c[0]
            _, sub_matrix = get_words_matrix(sub_words, word2vec)
            center = centers[label]
            cohesion = utilities.cohesion(sub_matrix, center)
            cohesions.append((label, cohesion))
        cohesions.sort(key=lambda x: x[-1])
        new_topic = list(topic[:-1])
        new_topic.append(cohesions[0][1])
        new_topic.append(clusters[cohesions[0][0]])
        new_topics.append(new_topic)
        for c in cohesions[1:]:
            u_topic = list(topic[:-1])
            u_topic.append(c[0])
            u_topic.append(c[1])
            u_topic.append(clusters[c[0]])
            useless.append(u_topic)
    return new_topics, useless
Example #3
    def __get_topic(self):
        """
            returns a dictionary of (hashtag: topic) attributes using Latent Dirichlet Allocation
        """
        tweet_topic = {}
        tweet_data = []
        for tweet in self.tweets:
            tweet_topic[tweet["id_str"]] = ""

            text = self.get_tweet_text(tweet)
            tweet_data.append((text, tweet["id_str"]))

        lda = LDA.LDA(tweet_data)

        for tweet in self.tweets:
            text = self.get_tweet_text(tweet)
            tweet_topic[tweet["id_str"]] = lda.predict_with_bag(text)
            # tweet_topic[tweet["id_str"]] = lda.predict_with_tf_idf(text)

        hashtag_topic = {}
        for hashtag in self.hashtags:
            hashtag_topic[hashtag["text"]] = []

        for tweet, hashtagList in self.tweet_hashtag_map.items():
            for hashtag in hashtagList:
                hashtag_topic[hashtag["text"]].append(tweet_topic[tweet])

        # a hashtag's topic is the topic of the majority of that hashtag's tweets
        hashtag_topic = {hashtag: self.most_common(l) for hashtag, l in
                             hashtag_topic.items()}
        return hashtag_topic
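The most_common helper called above is not part of this snippet. A plausible minimal sketch of such a method, assuming it simply returns the most frequent topic in a list (an assumption, not the project's actual code):

from collections import Counter

def most_common(self, topics):
    # Most frequent element; ties broken arbitrarily, None for hashtags with no tweets.
    return Counter(topics).most_common(1)[0][0] if topics else None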
Example #4
def main():
    filename = '../resource/train.csv'
    itemid, numattr, cateattr, label = readfile(filename)
    totalnum = len(numattr)
    testnum = totalnum * 0.1
    testnum = int(testnum)
    trainnum = totalnum - testnum
    trainnumattr = numattr[0: trainnum]
    traincateattr = cateattr[0: trainnum]
    trainlabel = label[0: trainnum]
    testnumattr = numattr[trainnum:]
    testcateattr = cateattr[trainnum:]
    testlabel = label[trainnum:]
    multidim = MultiDimension(traincateattr)
    trainextattr = multidim.gettrainextattr()
    testextattr = multidim.gettestextattr(testcateattr)
    trainattr = append(trainnumattr, trainextattr, axis = 1)
    testattr = append(testnumattr, testextattr, axis = 1)
    LDAcoe = LDA(trainattr, trainlabel)
    LDAtrainattr = conpress(trainattr, LDAcoe)
    LDAtestattr = conpress(testattr, LDAcoe)
    for i in range(20):
        print LDAtrainattr[i]
    import sys
    sys.exit(1)
    model = WeightedModel(LDAtrainattr, trainlabel)
    right = 0
    for i in range(testnum):
        p = model.predict(LDAtestattr[i])
        if p == testlabel[i]:
            right += 1
    accuracy = float(right) / testnum
    print 'accuracy:', accuracy
Example #5
def testSelectedFeatures1():
    print("start testSelectedFeatures1()")
    LRModel = lr.LogisticRegression(0.001, 500)
    LDAModel = LDA.LDA()
    data1 = genRWNormalized()
    data2 = np.append(data1[:, [10, 1, 9, 6]],
                      np.array([data1[:, -1]]).T,
                      axis=1)
    data3 = addSquareFeature(data1, [10, 1, 9, 6])
    a1 = 0
    b1 = 0
    a2 = 0
    b2 = 0
    a3 = 0
    b3 = 0
    for i in range(3):
        np.random.shuffle(data1)
        np.random.shuffle(data2)
        np.random.shuffle(data3)
        a1 += LRKFoldValidation(LRModel, data1, 5)
        b1 += LDAKFoldValidation(LDAModel, data1, 5)
        a2 += LRKFoldValidation(LRModel, data2, 5)
        b2 += LDAKFoldValidation(LDAModel, data2, 5)
        a3 += LRKFoldValidation(LRModel, data3, 5)
        b3 += LDAKFoldValidation(LDAModel, data3, 5)
    print("Accuracy for lr in rw (data1) is {}".format(a1 / 3))
    print("Accuracy for LDA in rw (data1) is {}".format(b1 / 3))
    print("Accuracy for lr in rw (data2) is {}".format(a2 / 3))
    print("Accuracy for LDA in rw (data2) is {}".format(b2 / 3))
    print("Accuracy for lr in rw (data3) is {}".format(a3 / 3))
    print("Accuracy for LDA in rw (data3) is {}".format(b3 / 3))
Example #6
def testLDAWithWine():
    data = genDataWOHeader(file_path1)
    qualityToCategory(data)
    np.random.shuffle(data)
    #data1= removeOutLiersByND(data2)
    testSet, trainSet = seperateTestSet(data)
    aModel = LDA.LDA()
    return LDAKFoldValidation(aModel, trainSet, 5)
Example #7
File: Test.py  Project: scs9826/FaceTest
def getResult(dataMat, label, testNum, classNum):
    """
    Load each class's test set and compute the overall accuracy.
    :param dataMat: training set
    :param label: label matrix of the training set
    :param testNum: number of test samples per class
    :param classNum: total number of classes
    :return:
    """
    Count = 0
    # TODO 85
    disc_set, disc_value = LDA.pca(dataMat, 85)
    redVects, Train_LDA = LDA.lda(dataMat, label, 85, 16, 11, 176)  # LDA projection space and the final training set
    for classnum in range(1, classNum + 1):
        print('Class', classnum)
        Count += compare(disc_set, Train_LDA, redVects, label, testNum, classnum, 5)
    print('Final correctCount:', Count / 16)
Example #8
def testLDAWithCancer():
    data = genData(file_path2)
    classToCategory(data)
    preprocessData(data)
    np.random.shuffle(data)
    #data1= removeOutLiersByND(data2)
    testSet, trainSet = seperateTestSet(data)
    aModel = LDA.LDA()
    return LDAKFoldValidation(aModel, trainSet, 5)
Example #9
def getResult(dataMat, label, PCA_dim, testNum, classNum, classInNum ):
    '''
    Load each class's test set and compute the overall accuracy.
    :param dataMat: training set
    :param label: label matrix of the training set
    :param testNum: number of test samples per class
    :param classNum: total number of classes
    :param classInNum: number of images per class
    :return:
    '''
    Count = 0
    # disc_set, disc_value ,meanFace= LDA.pca(dataMat, PCA_dim)
    disc_set, x, y = LDA.pca(dataMat, PCA_dim)
    Total = classNum * classInNum
    redVects, Train_LDA = LDA.lda(dataMat, label, PCA_dim, classNum, classInNum, Total)  # LDA projection space and the final training set
    for classnum in range(1, classNum + 1):
        print('Class', classnum)
        Count += compare(disc_set, Train_LDA, redVects, label, testNum,classnum, 7)
    print('Final correctCount:', Count/classNum)
Example #10
def small_data_test():
    test_1 = LDA.lda_function(
        [[1, 0, 1, 2, 3], [2, 1, 1, 2, 3], [2, 2, 3, 4, 5]], [[0], [1], [1]],
        3)
    result = [[-0.559, -0.281, -0.78], [-0.3336, -0.7849, 0.5219],
              [-0.759, 0.552, 0.345]]
    if test_1:
        return ("All OK")
    else:
        return ("something went wrong")
Example #11
def featureSelection(train_x, train_y, test_x):
    feature_pool = np.arange(train_x.shape[1])
    feature_selected_count = 0
    selected_feature = []
    while feature_selected_count < 2:
        best_acc = 0
        best_feature = None
        for feature in feature_pool:
            # Cannot select same features
            if feature in selected_feature:
                continue
            candidate_feature = selected_feature.copy()
            candidate_feature.append(feature)
            print()
            print("Candidate feature = {}".format(candidate_feature))
            divided = int(len(train_x) / K)
            overall_acc = 0
            total_PD = []
            total_FA = []
            for fold in range(K):
                # print()
                # print("Now fold is {}".format(fold))
                # Compute start and end index
                start = divided * fold
                end = divided * (fold + 1)
                # Restrict the data to the candidate features
                training_x = np.concatenate((train_x[:start, candidate_feature], train_x[end:, candidate_feature]))
                training_y = np.concatenate((train_y[:start], train_y[end:]))
                validation_x = train_x[start:end, candidate_feature]
                validation_y = train_y[start:end]
                y_pred = LDA.knn(training_x, training_y, validation_x)
                acc = LDA.compute_accuracy(y_pred, validation_y)
                overall_acc += acc
            print("Overall accuracy: {}".format(overall_acc / K))
            if overall_acc >= best_acc:
                best_acc = overall_acc
                best_feature = candidate_feature
        selected_feature = best_feature
        feature_selected_count += 1
    print("Feature selected: {}".format(selected_feature))
    print(train_x[:, selected_feature])
    return train_x[:, selected_feature], test_x[:, selected_feature]
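A small driver for the forward-selection helper above, using synthetic data; the global K (fold count) and the LDA.knn / LDA.compute_accuracy helpers are assumed to come from the surrounding project, and all names below are illustrative:

import numpy as np

K = 5  # fold count referenced inside featureSelection
rng = np.random.default_rng(0)
train_x = rng.normal(size=(100, 6))
train_y = rng.integers(0, 2, size=100)
test_x = rng.normal(size=(20, 6))

selected_train_x, selected_test_x = featureSelection(train_x, train_y, test_x)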
Example #12
def RunTrainLDA(infile, pcaFile, ldaFile):

    import cPickle

    fp = open(infile, "r")
    dataset = cPickle.load(fp)
    subjID = cPickle.load(fp)
    fp.close()

    pca = PCA(dataset)
    pca_proj = pca.compute()

    np.save(pcaFile, pca_proj)

    lda_proj = []
    lda = LDA(dataset, subjID, pca_proj)
    projData = lda.projectData()
    lda_proj = lda.train(projData)

    np.save(ldaFile, lda_proj)
Example #13
def identify_clusters_in_project(project_name, project_path):
    create_logging_folders(project_name)
    temp_json_location = f'{Settings.DIRECTORY}/data/output.json'

    utils.execute_parser(project_path)

    # Read parsed document
    parsed_raw_json = {}
    with open(temp_json_location) as json_file:
        parsed_raw_json = json.load(json_file)

    classes = extract_classes_information_from_parsed_json(parsed_raw_json)

    graph = nx.DiGraph()
    graph = Graph.create_dependencies(classes, graph)

    lda.apply_lda_to_classes(graph, classes)
    calculate_absolute_weights(graph, classes, weight_type=WeightType.LDA)

    # TODO : think about if the pre_processing should be done or not
    graph = Clustering.pre_process(graph,
                                   remove_weak_edges=False,
                                   remove_disconnected_sections=True)

    clusters_results = []
    if Settings.RESOLUTION:
        clusters, modularity = Clustering.community_detection_louvain(
            graph, resolution=Settings.RESOLUTION)
        clusters_results.append((clusters, modularity, Settings.RESOLUTION))
        Clustering.write_modularity_and_services(clusters_results)
    else:
        clusters_results = Clustering.compute_multiple_resolutions(graph,
                                                                   start=0.3,
                                                                   end=1.1,
                                                                   step=0.1)

    # TODO: Reconsider techniques of post-processing
    # clusters = PostProcessing.process(clusters, classes, graph.copy())
    return clusters_results
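A usage sketch for the function above, assuming the surrounding project supplies Settings, utils, Graph, Clustering, and lda, and that each result is a (clusters, modularity, resolution) tuple as in the RESOLUTION branch; the project name and path are placeholders:

if __name__ == '__main__':
    results = identify_clusters_in_project('sample-service', '/path/to/sample-service')
    for clusters, modularity, resolution in results:
        print(resolution, modularity, len(clusters))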
Example #14
def hierarchical_topic_analyse_with_silhouette(corpus_filename, word2vec_model_filename, lda_filter=False, k=1):
    if lda_filter:
        topic2terms = pickle.load(open(corpus_filename))
    else:
        topic2terms = LDA.get_topics_terms(corpus_filename)
        # topic2terms, _ = lda_terms_analysis(corpus_filename, word2vec_model_filename)
    topics = []
    for t in topic2terms:
        topics.append(list(t))
    if k == 0:
        return topics
    word2vec_model = models.Word2Vec.load(word2vec_model_filename)
    new_topics = []
    for topic in topics:
        words = topic[-1]
        clusters = cluster_analyse_with_silhouette(words, word2vec_model, k)
        for c in clusters:
            new_topic = topic[:-1]
            new_topic.extend(c)
            new_topics.append(new_topic)
    return new_topics
Example #15
def hierarchical_topic_analyse(lda_model_file, word2vec_model_file, k=1):
    topic2terms = LDA.get_topics_terms(lda_model_file)
    topics = []
    for t in topic2terms:
        topics.append(list(t))
    word2vec_model = models.Word2Vec.load(word2vec_model_file)
    for i in range(k):
        new_topics = []
        for t in topics:
            words = t[-1]
            dictionary, matrix = get_words_matrix(words, word2vec_model)
            clusters = cluster(matrix, dictionary, 2, 10)
            for item in clusters:
                labels = t[:-1]
                labels.extend(list(item))
                new_topics.append(labels)
            del dictionary
            del matrix
            del clusters
        topics = new_topics
    # topic_tree = get_topic_tree(topics)
    # return topic_tree
    return topics
Example #16
ax.plot_trisurf([0, eigVects[:2][0, 0], eigVects[:2][1, 0]],
                [0, eigVects[:2][0, 1], eigVects[:2][1, 1]],
                [0, eigVects[:2][0, 2], eigVects[:2][1, 2]])

plt.subplot(224)
plt.title('After projection')
plt.scatter(array(down_Mat)[:len(mansls), 0],
            array(down_Mat)[:len(mansls), 1],
            c='red')
plt.scatter(array(down_Mat)[len(mansls):, 0],
            array(down_Mat)[len(mansls):, 1],
            c='green')

plt.show()

#-------------- Apply LDA after dimensionality reduction ----------------
# Test LDA with the dimension-reduced data
w, mean1, mean2, group1, group2 = LDA.train(down_Mat[:len(mansls), :],
                                            down_Mat[len(mansls):, :])
# Reduce the dimensionality of the test set
testboy, labels = LDA.getdata('boy82.txt', 0)
testboy = mat(testboy)
# Dimensionality reduction (applied here to the test matrix)
test_Mat, test_eigvals, test_eigVects = pca_new(testboy, 2)
group3 = test_Mat.T
count = 0
for i in range(shape(group3)[1]):
    if LDA.predict(group3[:, i].T, w, mean1, mean2)[0, 0] >= 0:  # note: must match the test data
        count = count + 1
print('Accuracy after dimensionality reduction:', count / shape(group3)[1])  # about 0.96 after reduction vs. only 0.84 without it (using height and weight)
Example #17
File: hw7.py  Project: Dada870423/ML
sample_image = test_images[random.sample(range(len(test_label)), 10)]

if input_.mode == 0:
    ## Doing PCA and get the eigenface and W(dimension reduction)
    PCA_mean, PCA_EigenFace, PCA_W = PCA(images=images,
                                         Size=Size,
                                         FacePath="./PCA/EigenFace/")
    Reconstruct(EigenFace=PCA_EigenFace,
                sample_image=sample_image,
                Size=Size,
                Path="./PCA/")

    ## Doing LDA and get the fisherface and W(dimension reduction)
    LDA_mean, LDA_EigenFace, LDA_W = LDA(images=images,
                                         Size=Size,
                                         label=label,
                                         FacePath="./LDA/EigenFace/")
    Reconstruct(EigenFace=LDA_EigenFace,
                sample_image=sample_image,
                Size=Size,
                Path="./LDA/")

elif input_.mode == 1:
    ## Doing PCA and get the eigenface and W(dimension reduction)
    print("PCA:")
    PCA_mean, PCA_EigenFace, PCA_W = PCA(images=images,
                                         Size=Size,
                                         FacePath=None)
    ## Using PCA Knn on test image sets, I try to label the test images.
    KNN("PCA", k = 3, images = images, EigenFace = PCA_EigenFace.T, proj_train_image = PCA_W, label = label, \
        test_images = test_images, test_label = test_label)
Example #18
import LDA as C

C.clearScreen()
dataTraining = C.loadData("dataTraining.txt")

X = dataTraining[:, 0:2]
y = dataTraining[:, 2:3]

C.plotLDA(X, y)
C.plotNormalSurface(X, y)
Example #19
    key_set.add(keyword_rec)
    for keyword_rec_split_i in keyword_rec_split:
        key_set.add(keyword_rec_split_i)
    w2vModel = w2v.w2vModel(data)
    key_list = []
    for w2vkey in key_set:
        if w2vModel.model.wv.__contains__(w2vkey):
            key_list.append(w2vkey)
        else:
            print('false:', w2vkey, flush=True)
    words = w2vModel.get_similar_words(pos_words=key_list)
    for word in words:
        rec_words += word[0]
        rec_words += ' '

    lmodel = LDA.LDAModel(data,n_topics=100)
    topicwords = lmodel.model.print_topics(num_topics=2,num_words=5)
    wordsset = set()
    for i in topicwords:
        allwords = re.findall(r'".*?"',i[1])
        for j in allwords:
            jj = j.strip('"')
            if keyword_rec != jj:
                wordsset.add(jj)
    for i in wordsset:
        rec_words += i
        rec_words += ' '
    print(rec_words)
    print('key', len(words), flush=True)
    sqlcur.execute("REPLACE INTO keyword_recommend_t VALUES('" + str(keyword_rec) + "','" + timestr_2 + "','" + rec_words + "')")
    mysqldb.commit()
Example #20
import time
import BCWDataset, WQDataset
import LDA, LogisticRegression
import KFoldCrossValidator

bcwd = BCWDataset.BCWDataset()
bcwd.load()
wqd = WQDataset.WQDataset()
wqd.load()

print("LDA, BCW")
print(KFoldCrossValidator.validate(LDA.LDA(), 5, bcwd.X, bcwd.y))
print("LogReg, BCW")
print(
    KFoldCrossValidator.validate(
        LogisticRegression.LogisticRegression(flr=0.6, slr=0.1, num_it=100), 5,
        bcwd.X, bcwd.y))
print("LDA, WQ")
print(KFoldCrossValidator.validate(LDA.LDA(), 5, wqd.X, wqd.y))
print("LogReg, WQ")
print(
    KFoldCrossValidator.validate(
        LogisticRegression.LogisticRegression(flr=0.6, slr=0.1, num_it=100), 5,
        wqd.X, wqd.y))
Example #21
    def _init_trans_mat(self):
        # Check input
        if any([x is None for x in [self.X, self.labels, self.d]]):
            raise ValueError('X, labels and subdim not set!')

        num_pts = self.X.shape[0]
        D = self.X.shape[1]
        subdim = self.d

        # Setup random state
        prng = RandomState()
        if self._SEED is not None:
            prng = RandomState(self._SEED)
            if self._verbose:
                print("Setting random seed to", self._SEED)

        if self._init_method == "PCA":
            if num_pts < self.d:
                raise ValueError('num_pts < subdim')
            if self.d > D:
                raise ValueError('subdim > inputdim')

            pca = PCA(n_components=subdim, whiten=False)
            pca.fit(self.X)
            L = pca.components_.T + 1E-6

        elif self._init_method == "LDA":
            if self.d > D:
                raise ValueError('subdim > inputdim')

            lda_obj = LDA.LDA(self.X, self.labels)
            lda_obj.compute(dim=self.d)
            L = lda_obj.getTransform()
            L = L * (1. / LA.norm(L, ord=1, axis=1)).reshape(-1, 1)
        elif self._init_method == "randbeng":
            # L = 1. * bound * prng.rand(D, self.d) - bound
            L = np.random.normal(0,
                                 np.sqrt(2) / np.sqrt(self.D + self.d),
                                 (self.D, self.d))
        elif self._init_method == "randbest":
            # Do some random generation of matrices pick the one with lowest # of constraints
            if self._verbose:
                print('Doing random pre-gen L')
            t0 = timeit.default_timer()
            best_L = prng.rand(D, self.d)
            L = best_L
            self.loss_fun(best_L)
            # nconsts = self._count_active_constraints()
            bound = np.sqrt(6. / (D + self.d))
            best_N_consts = 1E10
            for i in range(0, 10):
                L = 1. * bound * prng.rand(D, self.d) - bound
                # L = 1E-5*prng.rand(D,self.d)
                # L = L * (1./LA.norm(L,ord=1,axiss=1)).reshape(-1,1)
                self.loss_fun(L)
                consts = self._count_active_constraints()
                if consts < best_N_consts:
                    best_N_consts = consts
                    best_L = copy.copy(L)
            L = copy.copy(best_L)
            if self._verbose:
                print("Pre-gen of L done. Took:",
                      "%3.3f" % (timeit.default_timer() - t0),
                      end=", ")
                print("# active const", best_N_consts, end=", ")

        elif self._init_method == "rand":
            # method_str = print('Doing random pre-gen Lapa')
            bound = np.sqrt(6. / (D + self.d))
            L = 1. * bound * prng.rand(D, self.d) - bound

        return L
Example #22
import create_data
import LDA
import argparse

parser = argparse.ArgumentParser(description='Enter number of topics')
parser.add_argument('--n_topics', type=int, help='Number of Topics')

args = parser.parse_args()

path = "news-articles.txt"

create_data.create(path)
LDA.modeller(args.n_topics)
Example #23
#!/usr/bin/env python
# encoding: utf-8

import LDA


if __name__ == '__main__':
    ts = LDA.get_topics_terms('../data/models/sougou_lda_50_model.md')
    out = open('topics', 'wb')
    for t, terms in ts.items():
        out.write(str(t) + '\n')
        out.write('\t'.join(terms).encode('utf8') + '\n')
        out.write('\n')
    out.close()
Example #24
#for userId, user in dic_user.iteritems():
#	print(str(userId) + " " + str(len(user.tweet_set)))

k_topics = num_topics
LDA_iterations = num_iterations
sentimentPoints = getSentimentPoints()
#print(sentimentPoints)

dictionary, corpus, out_set = preprocessing(doc_set)

for i in range(0,len(out_set)):
	tweet_set[i].wordSet = out_set[i]

sentimentsOfTweets = getSentimentScoreOfTweets(out_set)
model = LDA(dictionary, corpus, k_topics, LDA_iterations)

for i in range(0,len(sentimentsOfTweets)):
	tweet_set[i].russell_tuple = sentimentsOfTweets[i]

sentDic = loadDict()

dictByTopic = []
tempDic = {}
topics = model.get_topics()

for topic in topics:
	tempDic = {}
	for i in range(0,len(topic)):
		tempDic[dictionary[i]] = topic[i]
	dictByTopic.append(tempDic)
Example #25
def featureSelection(data, isLR):
    selectedFeatureNum = []
    selectedFeatureArray = -1
    bestAccuracyAll = 0
    y_2d = np.array([data[:, -1]]).T
    #print(y_2d)
    for i in range(data.shape[1] - 1):
        featureToAdd = -1
        bestAccuracy = 0
        column_2d = -1
        print("select feature {}".format(i))
        if i == 0:
            for j in range(data.shape[1] - 1):
                if (j in selectedFeatureNum) == False:
                    column_2d = np.array([data[:, j]]).T
                    nums = selectedFeatureNum + [j]

                    # ------5 should be changed --
                    #print(np.concatenate((column_2d,y_2d), axis = 1))
                    if isLR:
                        model = lr.LogisticRegression(0.001, 500)
                        accuracy = LRKFoldValidation(
                            model, np.concatenate((column_2d, y_2d), axis=1),
                            5)
                    else:
                        model = LDA.LDA()
                        accuracy = LDAKFoldValidation(
                            model, np.concatenate((column_2d, y_2d), axis=1),
                            5)

                    print("Using feature(s) {} accuracy is {}".format(
                        nums, accuracy))
                    if accuracy >= bestAccuracy:
                        bestAccuracy = accuracy
                        featureToAdd = j
            selectedFeatureArray = column_2d
            bestAccuracyAll = bestAccuracy
            selectedFeatureNum.append(featureToAdd)
            continue
        else:
            #try add feature from the rest of set
            for j in range(data.shape[1] - 1):
                if (j in selectedFeatureNum) == False:
                    column_2d = np.array([data[:, j]]).T
                    nums = selectedFeatureNum + [j]

                    # ------5 should be changed ---
                    #print(np.concatenate((selectedFeatureArray, column_2d , y_2d), axis = 1))
                    if isLR:
                        model = lr.LogisticRegression(0.001, 500)
                        accuracy = LRKFoldValidation(
                            model,
                            np.concatenate(
                                (selectedFeatureArray, column_2d, y_2d),
                                axis=1), 5)
                    else:
                        model = LDA.LDA()
                        accuracy = LDAKFoldValidation(
                            model,
                            np.concatenate(
                                (selectedFeatureArray, column_2d, y_2d),
                                axis=1), 5)
                    print("Using feature(s) {} accuracy is {}".format(
                        nums, accuracy))
                    if accuracy >= bestAccuracy:
                        bestAccuracy = accuracy
                        featureToAdd = j

        #additional feature cannot improve performance by 1%
        if bestAccuracyAll >= bestAccuracy:
            print("maxima reached")
            break
        else:
            # add an additional feature
            bestAccuracyAll = bestAccuracy
            selectedFeatureNum.append(featureToAdd)
            selectedFeatureArray = np.concatenate(
                (selectedFeatureArray, np.array([data[:, featureToAdd]]).T),
                axis=1)
    print(
        "feature selection ended, best performing features are {}, the accuracy is {}"
        .format(selectedFeatureNum, bestAccuracyAll))
    return selectedFeatureNum, selectedFeatureArray
Example #26
            x_n = x_n.reshape(-1, 1)
            p_ks = np.empty(len(self.unique_y))
        
            for j, k in enumerate(self.unique_y):
                p_x_given_y = self._mvn_density(x_n, self.mu_ks[j], self.Sigma)
                p_y_given_x = self.pi_ks[j]*p_x_given_y
                p_ks[j] = p_y_given_x
            
            y_n[i] = self.unique_y[np.argmax(p_ks)]
        
        return y_n
            

We fit the LDA model below and classify the training observations. As the output shows, we have 100% training accuracy.

lda = LDA()
lda.fit(X, y)
yhat = lda.classify(X)
np.mean(yhat == y)

The function below visualizes class predictions based on the input values for a model with $\mathbf{x}_n \in \mathbb{R}^2$. To apply this function, we build a model with only two columns from the `wine` dataset. We see that the decision boundaries are linear, as we expect from LDA.

def graph_boundaries(X, model, model_title, n0 = 100, n1 = 100, figsize = (7, 5), label_every = 4):
        
        # Generate X for plotting 
        d0_range = np.linspace(X[:,0].min(), X[:,0].max(), n0)
        d1_range = np.linspace(X[:,1].min(), X[:,1].max(), n1)
        X_plot = np.array(np.meshgrid(d0_range, d1_range)).T.reshape(-1, 2)
        
        # Get class predictions
        y_plot = model.classify(X_plot).astype(int)
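The call the paragraph above refers to is not included in this excerpt. A minimal sketch of how the two-column wine model might be built and plotted, assuming wine is a DataFrame of features and y the class labels (the column names are placeholders, not the original choice):

X2 = wine[['alcohol', 'flavanoids']].values  # assumed two-column subset
lda2 = LDA()
lda2.fit(X2, y)
graph_boundaries(X2, lda2, model_title='LDA')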
    # y_pred = LDA.knn(breast_lower_dimension_train, breast_train_y.values.ravel(), breast_lower_dimension_test)
    # acc = LDA.compute_accuracy(y_pred, breast_test_y.values.ravel())

    # print("=================== IONOSPHERE ==============")
    # ionosphere_train_x_selection, ionosphere_test_x_selection = featureSelection(ionosphere_train_x.values, ionosphere_train_y.values.ravel(), ionosphere_test_x.values)
    # ionosphere_lower_dimension_train, ionosphere_lower_dimension_test = LDA.LDA(ionosphere_train_x_selection, ionosphere_train_y.values.ravel(), ionosphere_test_x_selection, ionosphere_test_y.values.ravel(), 'ionosphere')
    # prior, train_mean, train_cov = NBC.train(ionosphere_train_x.values, ionosphere_train_y.values.ravel(), CLASS_NUM)
    # acc = NBC.test(ionosphere_test_x.values, ionosphere_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'ionosphere', 'NBC', True)
    # # # Project to lower dimension
    # # crossValidation(iris_lower_dimension_train, iris_train_y, CLASS_NUM, 'iris', 'NBC', K)        # BUG
    # prior, train_mean, train_cov = NBC.train(ionosphere_lower_dimension_train, ionosphere_train_y.values.ravel(), CLASS_NUM)
    # acc = NBC.test(ionosphere_lower_dimension_test, ionosphere_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'ionosphere_lower', 'NBC', True)

    print("=================== WINE ==============")
    wine_train_x_selection, wine_test_x_selection = featureSelection(wine_train_x.values, wine_train_y.values.ravel(), wine_test_x.values)
    wine_lower_dimension_train, wine_lower_dimension_test = LDA.LDA(wine_train_x_selection, wine_train_y.values.ravel(), wine_test_x_selection, wine_test_y.values.ravel(), 'wine')
    # prior, train_mean, train_cov = NBC.train(wine_train_x.values, wine_train_y.values.ravel(), CLASS_NUM)
    # acc = NBC.test(wine_test_x.values, wine_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'wine', 'NBC', True)
    # Pocket classifier
    # crossValidation(ionosphere_train_x, ionosphere_train_y, class_num, 'ionosphere', 'PC', K)
    train_weight = PC.train(wine_train_x.values, wine_train_y.values.ravel(), CLASS_NUM)
    acc = PC.test(wine_test_x.values, wine_test_y.values.ravel(), train_weight, CLASS_NUM, 'wine', 'PC', True)
    
    # # Project to lower dimension
    # # crossValidation(iris_lower_dimension_train, iris_train_y, CLASS_NUM, 'iris', 'NBC', K)        # BUG
    # prior, train_mean, train_cov = NBC.train(wine_lower_dimension_train, wine_train_y.values.ravel(), CLASS_NUM)
    # acc = NBC.test(wine_lower_dimension_test, wine_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'wine_lower', 'NBC', True)

    # Pocket classifier
    # crossValidation(ionosphere_train_x, ionosphere_train_y, class_num, 'ionosphere', 'PC', K)
    train_weight = PC.train(wine_lower_dimension_train, wine_train_y.values.ravel(), CLASS_NUM)
reload(sys)
sys.setdefaultencoding('utf8')

# inputSrc = './sampleTest.txt'
inputSrc = './appleLegalDocDir/apple-osx-test.txt'
minTokenNum = 4
tokenList = list()
sentenceList = list()

with open(inputSrc) as f:
    text = f.read()

# Task 0: use LDA to filter out common topics. This way the main popular topic words won't affect
# calculation of similarities.

ldaModel = LDA.loadModel()
topics, topicCoherencePair = LDA.getTopicsWithNormalizedWeight(ldaModel)

sentences = vocabBuilder.tokenize(text)
for sentence in sentences:
    tokens = list()
    tmpTokens = sentence.split()
    for token in tmpTokens:
        if token not in topics:
            tokens.append(token)

    if len(tokens) >= minTokenNum:
        tokenList.append(tokens)
        sentenceList.append(sentence)
print 'There are ', len(sentenceList), ' sentences in the input sample.'
Example #29
def SJ_interest_measurement_run():
    db_client = MongoClient('mongodb://%s:%s@%s' %
                            (MONGODB_ID, MONGODB_PW, MONGODB_HOST))
    db = db_client["soojle"]

    renewal_time = find_variable(db, 'renewal')

    USER_list = find_user_renewal(db, renewal_time)
    USER_list = list(USER_list)

    ACTION_DAY_CHECK = get_default_day(SJ_USER_ACTION_DAY_CHECK)

    CATEGORY_list = find_all_category_of_topic(db)
    CATEGORY_list = list(CATEGORY_list)
    CATEGORY = {}
    # Convert to a dict for fast lookup when computing newsfeed interest tags
    for cate in CATEGORY_list:
        CATEGORY[cate['category_name']] = cate['tag']

    for USER in USER_list:
        # Skip users who have no likes/views at all.
        if (len(USER['fav_list']) == 0) and (len(USER['view_list']) == 0):
            continue

        user_log_backup(db, USER)

        fav_tag = []
        view_tag = []
        newsfeed_tag = []
        fav_token = []
        view_token = []
        search_list = []

        # Posts the user marked as favorites ##########################
        fav_topic = (np.zeros(LDA.NUM_TOPICS))
        if len(
                USER['fav_list']
        ) <= SJ_USER_LOG_LIMIT['fav'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for fav in USER['fav_list']:
                fav_topic += fav['topic']
                fav_tag += fav['tag']
                fav_token += fav['token']
        else:
            for fav in USER['fav_list']:
                if fav['date'] < ACTION_DAY_CHECK: continue
                fav_topic += fav['topic']
                fav_tag += fav['tag']
                fav_token += fav['token']

        # Compute FAS
        fav_doc = (fav_tag + fav_token) * 2

        # Posts the user viewed ##############################
        view_topic = (np.zeros(LDA.NUM_TOPICS))
        if len(
                USER['view_list']
        ) <= SJ_USER_LOG_LIMIT['view'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for view in USER['view_list']:
                view_topic += view['topic']
                view_tag += view['tag']
                view_token += view['token']
        else:
            for view in USER['view_list']:
                if view['date'] < ACTION_DAY_CHECK: continue
                view_topic += view['topic']
                view_tag += view['tag']
                view_token += view['token']

        # Compute FAS
        view_doc = view_tag + view_token

        # Keywords the user searched for ##############################
        if len(
                USER['search_list']
        ) <= SJ_USER_LOG_LIMIT['search'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for search in USER['search_list']:
                search_list += search['tokenizer_split']
        else:
            for search in USER['search_list']:
                if search['date'] < ACTION_DAY_CHECK: continue
                search_list += search['tokenizer_split']

        search_topic = LDA.get_topics(search_list)
        search_doc = search_list

        # Newsfeeds the user accessed ################################
        A_NUM = 0  # university
        B_NUM = 0  # clubs & meetups
        C_NUM = 0  # contests & events
        D_NUM = 0  # careers & recruiting
        E_NUM = 0  # general

        if len(
                USER['newsfeed_list']
        ) <= SJ_USER_LOG_LIMIT['newsfeed'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for newsfeed in USER['newsfeed_list']:
                if newsfeed['newsfeed_name'] == '대학교': A_NUM += 1
                elif newsfeed['newsfeed_name'] == '동아리&모임': B_NUM += 1
                elif newsfeed['newsfeed_name'] == '공모전&행사': C_NUM += 1
                elif newsfeed['newsfeed_name'] == '진로&구인': D_NUM += 1
                else: E_NUM += 1
        else:
            for newsfeed in USER['newsfeed_list']:
                if newsfeed['date'] < ACTION_DAY_CHECK: continue

                if newsfeed['newsfeed_name'] == '대학교': A_NUM += 1
                elif newsfeed['newsfeed_name'] == '동아리&모임': B_NUM += 1
                elif newsfeed['newsfeed_name'] == '공모전&행사': C_NUM += 1
                elif newsfeed['newsfeed_name'] == '진로&구인': D_NUM += 1
                else: E_NUM += 1

                newsfeed_tag += newsfeed['tag']

        newsfeed_tag += (CATEGORY['대학교'] * A_NUM) + (
            CATEGORY['동아리&모임'] * B_NUM) + (CATEGORY['공모전&행사'] * C_NUM) + (
                CATEGORY['진로&구인'] * D_NUM) + (CATEGORY['커뮤니티'] * E_NUM)

        newsfeed_topic = LDA.get_topics(newsfeed_tag)

        # Apply weights
        fav_tag *= SJ_FAV_TAG_WEIGHT
        view_tag *= SJ_VIEW_TAG_WEIGHT

        fav_topic *= SJ_FAV_TOPIC_WEIGHT
        view_topic *= SJ_VIEW_TOPIC_WEIGHT
        search_topic *= SJ_SEARCH_TOPIC_WEIGHT
        newsfeed_topic *= SJ_NEWSFEED_TOPIC_WEIGHT

        if len(USER['fav_list']) != 0:
            fav_topic /= len(USER['fav_list'])

        if len(USER['view_list']) != 0:
            view_topic /= len(USER['view_list'])

        #LDA Topic
        TOPIC_RESULT = (fav_topic + view_topic + search_topic +
                        newsfeed_topic) / SJ_TOPIC_RESULT_DIV

        #FASTTEXT
        FastText_doc = fav_doc + view_doc + search_doc

        if FastText_doc:
            USER_VERCTOR = FastText.get_doc_vector(fav_doc + view_doc +
                                                   search_doc).tolist()
        else:
            USER_VERCTOR = ft_vector = (np.zeros(FastText.VEC_SIZE)).tolist()

        #TAG
        tag_dict = dict(Counter(fav_tag + view_tag))
        tag_dict = sorted(tag_dict.items(), key=lambda x: x[1])

        # Final tag object
        TAG_RESULT = {}

        if len(tag_dict) >= 50:
            if tag_dict[0][1] == 1:
                tag_dict[0][1] = 2

            TAG_RESULT[tag_dict[0][0]] = tag_dict[0][1]

            for i in range(1, 50):
                tag_dict[i] = list(tag_dict[i])

                if (tag_dict[i - 1][1] * 1.5) < tag_dict[i][1]:
                    tag_dict[i][1] = int(tag_dict[i - 1][1] * 1.5)

                TAG_RESULT[tag_dict[i][0]] = tag_dict[i][1]

        elif len(tag_dict) > 0:
            if tag_dict[0][1] == 1:
                tag_dict[0][1] = 2

            TAG_RESULT[tag_dict[0][0]] = tag_dict[0][1]

            for i in range(1, len(tag_dict)):
                tag_dict[i] = list(tag_dict[i])

                if (tag_dict[i - 1][1] * 1.5) < tag_dict[i][1]:
                    tag_dict[i][1] = int(tag_dict[i - 1][1] * 1.5)

                TAG_RESULT[tag_dict[i][0]] = tag_dict[i][1]

        USER_TAG_SUM = sum(TAG_RESULT.values())

        USER_TAG_SUM *= SJ_TAG_SUM_WEIGHT

        if USER_TAG_SUM == 0:
            USER_TAG_SUM = 1

        # Build the user tag vector from the user's tags
        USER_TAGS = []
        for key, value in TAG_RESULT.items():
            USER_TAGS += [key] * value
        TAG_VECTOR = FastText.get_doc_vector(USER_TAGS).tolist()

        # Update this user's interest profile!
        update_user_measurement(
            db, USER['_id'], list(TOPIC_RESULT), TAG_RESULT, USER_TAG_SUM,
            TAG_VECTOR, USER_VERCTOR,
            len(USER['fav_list']) + len(USER['view_list']) +
            len(USER['search_list']))

    update_variable(db, 'renewal', datetime.now())

    if db_client is not None:
        db_client.close()
Example #30
k_topics = 3
LDA_iterations = 500
sentimentPoints = getSentimentPoints()

dictionary, corpus, out_set = preprocessing(doc_set)

fileOut = open("out_dic", 'w')
print(dictionary, file=fileOut)
fileOut.close()

for i in range(0, len(out_set)):
    tweet_set[i].wordSet = out_set[i]

sentimentsOfTweets = getSentimentScoreOfTweets(out_set)

model = LDA(dictionary, corpus, k_topics, LDA_iterations)

#sentimentsOfTopics =  getSentimentsScoreOfTopics(out_set, model.get_topics(), dictionary)

print(sentimentsOfTweets)
#print(sentimentsOfTopics)

for i in range(0, len(sentimentsOfTweets)):
    tweet_set[i].russell_tuple = sentimentsOfTweets[i]

fileOut = open("out_model", 'w')
print(model.print_topics(num_topics=k_topics, num_words=10), file=fileOut)
fileOut.close()

#print(getStrOfSentiment(getPolaritySent(sentimentsOfTweets[0])))
#print(getStrOfSentiment(getPrimarySent(sentimentsOfTweets[0], sentimentPoints)))
Example #31
def perform_lda(train_dataset, train_labelset, test_dataset):
    lda = LDA.LDA(train_dataset, train_labelset)
    projection_matrix, projected_train_data = lda.fit()
    print(np.shape(projection_matrix), np.shape(test_dataset))
    projected_test_data = lda.test_fit(projection_matrix, test_dataset)
    return projected_train_data, projected_test_data
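A small driver sketch for perform_lda with synthetic arrays; the shapes, the two-class labels, and the final print are illustrative assumptions:

import numpy as np

rng = np.random.default_rng(0)
train_X = rng.normal(size=(60, 8))
train_y = rng.integers(0, 2, size=60)
test_X = rng.normal(size=(15, 8))

proj_train, proj_test = perform_lda(train_X, train_y, test_X)
print(np.shape(proj_train), np.shape(proj_test))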
Example #32

# In[52]:

(wtp, lwtp, zd, totz) = model.GibbsSampler(widf, docs, rdf.word, T, ITER, lidf,lwd)


# In[55]:

fpath = path+model_name+'_T'+str(T)+'s'+str(min_durantion)+'/'

if not os.path.exists(fpath):
  os.makedirs(fpath)

if model_name == 'LDA':
  model.visualTopic(T, fpath, dataset, wtp)
else:
  util.visualTopic(T, fpath, dataset, wtp)

np.save(fpath+'wtp.npy',wtp)
if(len(lwtp)>0):
  np.save(fpath+'lwtp.npy',lwtp)

# In[56]:

if LW:
  (nwtp,nlwtp) = util.nomalise(wtp, lwtp)
elif model_name != 'LDA':
  (nwtp,nlwtp) = util.nomalise(wtp)
else:
  nlwtp=[]
Example #33
    print('K', lda.K)
    print('_uniqTermSet', lda._uniqTermSet)
    print('docsSize', lda._docNum)
    print('termSize', lda._termNum)
    print('Z ini:', lda.Z)
    print('docTopic ini', lda._docTopic)  ##4 doc,2topic
    print('lda.termTopic', lda._termTopic)
    print('lda.Phi', lda.Phi)
    print('lda.Theta', lda.Theta)


if __name__ == "__main__":
    corpus = [
        "With all of the critical success Downey had experienced throughout his career, he had not appeared in a blockbuster film. That changed in 2008 when Downey starred in two critically and commercially successful films, Iron Man and Tropic Thunder. In the article Ben Stiller wrote for Downey's entry in the 2008 edition of The Time 100, he offered an observation on Downey's commercially successful summer at the box office.",
        "On June 14, 2010, Downey and his wife Susan opened their own production company called Team Downey. Their first project was The Judge.",
        "Robert John Downey Jr. is an American actor, producer, and singer. His career has been characterized by critical and popular success in his youth, followed by a period of substance abuse and legal troubles, before a resurgence of commercial success in middle age.",
        "In 2008, Downey was named by Time magazine among the 100 most influential people in the world, and from 2013 to 2015, he was listed by Forbes as Hollywood's highest-paid actor. His films have grossed over $14.4 billion worldwide, making him the second highest-grossing box-office star of all time."
    ]

    X = [i.split(' ') for i in corpus]
    lda = LDA.LDA()
    lda.fit(X)

    printAttr(lda)

    #fig,ax= lda.plotDocTopicDist(2)

    #fig,ax = lda.plotTermTopicDist(2)

    #fig,ax = lda.plotTopicTermDist(1)
    plt.show()
Example #34
    #tokens_ko = t.nouns(document1)
    #print(type(tokens_ko))
    #print(tokens_ko)
    split = []
    docRemovingStopWord = [i for i in tokens_ko if not i in koreanStopWord]
    texts.append(docRemovingStopWord)
    tokens_ko = []

# Variable declared for lookup (the script runs fine without it)
#ko = nltk.Text(tokens_ko, name='document')

# Convert the tokenized data back into a list.
# texts.append(tokens_ko)

# Build a dictionary from the tokenized data list (assigns an id to each token)
dictionary = LDA.Dictionary(texts)
#dictionary_path = "/home/ice-kms/LDAModel/iter_1000_Real_articleDic_10000_compound_topicNum_20.dict"
#corpora.Dictionary.save(dictionary, dictionary_path)

# Convert the tokenized data to bag-of-words to build the document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# Build the LDA model
#ldamodel = gensim.models.LdaMallet(corpus,num_topics=20,id2word=dictionary, passes=20)
ldamodel = LDA.ldamodel.LdaModel(corpus,
                                 num_topics=2,
                                 id2word=dictionary,
                                 passes=100,
                                 iterations=1000)

#print(len(ldamodel))
Example #35
def testPic(dataMat, label):
    # def testPic(dataMat, label, disc_set, disc_value, redVects, Train_LDA):
    print("thread")
    j = 0
    isRight = 0
    isRight2 = 0
    testTimes = 0
    while True:
        testImgSet = './pic/s0.bmp'
        if not os.path.isfile(testImgSet):
            continue

        disc_set, disc_value, meanFace = LDA.pca(dataMat, 40)
        redVects, Train_LDA = LDA.lda(dataMat, label, 40, 17, 11,
                                      11 * 17)  # LDA projection space and the final training set

        # testImgSet = createImageSet.createTestMat('Yale', testInClass, testNum, testInClass, 100 * 100)
        testImgSet = ImageSet.HistogramEqualization(testImgSet)
        # print("shape", testImgSet.shape)
        testImgSet = np.reshape(testImgSet, (-1, 1))
        testImgSet = disc_set.T.dot(testImgSet)
        testImgSet = redVects.T.dot(testImgSet)
        disList = []
        testVec = np.reshape(testImgSet, (1, -1))
        for sample in Train_LDA.T:
            disList.append(np.linalg.norm(testVec - sample))
        # print('disList', disList)
        sortIndex = np.argsort(disList)
        print(label[sortIndex[0]])
        if 16 == int(label[sortIndex[0]]):
            isRight = isRight + 1
        if 17 == int(label[sortIndex[0]]):
            isRight2 = isRight2 + 1
        os.remove('./pic/s0.bmp')
        j = j + 1
        # j = j + 1
        # Detect eyes on the face; (40, 40) is the minimum size, anything smaller is not detected
        # eyes = eye_cascade.detectMultiScale(roi_gray, 1.03, 5, 0, (40, 40))
        # Draw the eyes
        # for(ex, ey, ew, eh) in eyes:
        #     cv2.rectangle(img, (x+ex, y+ey), (x+ex+ew, y+ey+eh), (0, 255, 0), 2)
        if j == 5:
            if isRight >= 4 or isRight2 >= 4:
                if isRight >= 4:
                    print("Welcome, Shi Changshun!")
                    # break
                    camera.release()
                    cv2.destroyAllWindows()
                    break
                if isRight2 >= 4:
                    print("Welcome, Rao Siyu!")
                    # break
                    camera.release()
                    cv2.destroyAllWindows()
                    break
            else:
                if isRight < 4:
                    isRight = 0
                    testTimes += 1
                    print("Test failed")
                    if testTimes >= 5:
                        # break
                        camera.release()
                    j = 0
                if isRight2 < 4:
                    isRight2 = 0
                    testTimes += 1
                    print("Test failed 2")
                    if testTimes >= 5:
                        # break
                        camera.release()
                    j = 0
Example #36
def measurement_run():
	db_client = MongoClient('mongodb://%s:%s@%s' %(MONGODB_ID, MONGODB_PW, MONGODB_HOST))
	db = db_client["soojle"]

	renewal_time = find_variable(db, 'renewal')

	# Only measure users whose data changed after the last renewal time (i.e., the metrics used for interest measurement have changed!)
	USER_list = find_user_renewal(db, renewal_time)
	USER_list = list(USER_list)

	for USER in USER_list:
		fav_tag = []
		view_tag = []
		newsfeed_tag = []
		fav_token = []
		view_token = []
		search_list = []

		# Posts the user marked as favorites ##########################
		fav_topic = (np.zeros(LDA.NUM_TOPICS))
		for fav in USER['fav_list']:
			fav_topic += fav['topic']
			fav_tag += fav['tag']
			fav_token += fav['token']

		# Compute FAS
		fav_doc = (fav_tag + fav_token) * 2

		# Posts the user viewed ##############################
		view_topic = (np.zeros(LDA.NUM_TOPICS))
		for view in USER['view_list']:
			view_topic += view['topic']
			view_tag += view['tag']
			view_token += view['token']

		# Compute FAS
		view_doc = view_tag + view_token

		# Keywords the user searched for ##############################
		for search_obj in USER['search_list'][:SJ_SEARCH_MEASURE_NUM]:
			search_list += search_obj['tokenizer_split']
		
		search_topic = LDA.get_topics(search_list)
		search_doc = search_list

		# Newsfeeds the user accessed ################################
		for newsfeed in USER['newsfeed_list']:
			newsfeed_tag += newsfeed['tag']

		newsfeed_topic = LDA.get_topics(newsfeed_tag)


		# Apply weights
		fav_tag *= SJ_FAV_TAG_WEIGHT
		view_tag *= SJ_VIEW_TAG_WEIGHT
		
		fav_topic *= SJ_FAV_TOPIC_WEIGHT
		view_topic *= SJ_VIEW_TOPIC_WEIGHT
		search_topic *= SJ_SEARCH_TOPIC_WEIGHT
		newsfeed_topic *= SJ_NEWSFEED_TOPIC_WEIGHT

		if len(USER['fav_list']) != 0:
			fav_topic /= len(USER['fav_list'])
		
		if len(USER['view_list']) != 0:
			view_topic /= len(USER['view_list'])

		#LDA Topic
		TOPIC_RESULT = (fav_topic + view_topic + search_topic + newsfeed_topic)/SJ_TOPIC_RESULT_DIV

		#FASTTEXT
		FastText_doc = fav_doc + view_doc + search_doc

		if FastText_doc:
			USER_VERCTOR = FastText.get_doc_vector(fav_doc + view_doc + search_doc).tolist()
		else:
			USER_VERCTOR = ft_vector = (np.zeros(FastText.VEC_SIZE)).tolist()
			
		#TAG
		tag_dict = dict(Counter(fav_tag + view_tag))
		tag_dict = sorted(tag_dict.items(), key=lambda x: x[1], reverse = True)
		
		# Keep only the top-ranked tags by frequency.
		TAG_RESULT = {}

		if len(tag_dict) >= 10:
			for i in range(10):
				TAG_RESULT[tag_dict[i][0]] = tag_dict[i][1]
		else:
			for i in range(len(tag_dict)):
				TAG_RESULT[tag_dict[i][0]] = tag_dict[i][1]
					
		USER_TAG_SUM = sum(TAG_RESULT.values())

		# Multiply by 1.5
		USER_TAG_SUM *= SJ_TAG_SUM_WEIGHT

		# If TAG_SUM is 0, set it to 1.
		if USER_TAG_SUM == 0:
			USER_TAG_SUM = 1

		# Update this user's interest profile! (increment the measurement count by 1)
		update_user_measurement(db, USER['_id'], list(TOPIC_RESULT), TAG_RESULT, USER_TAG_SUM, USER_VERCTOR, USER['measurement_num']+1)

	update_variable(db, 'renewal', datetime.now())

	if db_client is not None:
		db_client.close()