Example #1
def test_LDA():
    from LDA import LDA
    import copy
    x = [[2.95, 6.63], [2.53, 7.79], [3.57, 5.65], [3.16, 5.47], [2.58, 4.46],
         [2.16, 6.22], [3.27, 3.52]]
    e = copy.deepcopy(x)
    y = [1, 1, 1, 1, 2, 2, 2]
    t = LDA(x, y)
    for a in e:
        r = t.predict(a)
        print(max(r, key=r.get))
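A hedged sketch (not the project's actual LDA module) of a classifier with the same constructor/predict interface the test above assumes: fit per-class Gaussians with a shared covariance and return a dict of class scores, so that max(r, key=r.get) picks the predicted class.

import numpy as np

class SimpleLDA:
    def __init__(self, x, y):
        x = np.asarray(x, dtype=float)
        y = np.asarray(y)
        self.classes = np.unique(y)
        self.priors = {c: float(np.mean(y == c)) for c in self.classes}
        self.means = {c: x[y == c].mean(axis=0) for c in self.classes}
        # pooled (shared) covariance estimate over all class-centred samples
        centred = np.vstack([x[y == c] - self.means[c] for c in self.classes])
        self.cov_inv = np.linalg.pinv(np.cov(centred, rowvar=False))

    def predict(self, a):
        a = np.asarray(a, dtype=float)
        scores = {}
        for c in self.classes:
            m = self.means[c]
            # linear discriminant score: a' S^-1 m - 0.5 m' S^-1 m + log prior
            scores[c] = float(a @ self.cov_inv @ m
                              - 0.5 * m @ self.cov_inv @ m
                              + np.log(self.priors[c]))
        return scores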
Example #2
def lda(text):

    lda = LDA(text)
    lda.setTopics(10)
    lda.buildModel()
    lda.writeTopics()
    vector = lda.getTopics()
    del lda

    return vector
Example #3
def test_LDA():
    from LDA import LDA
    import copy
    x = [
        [2.95, 6.63],
        [2.53, 7.79],
        [3.57, 5.65],
        [3.16, 5.47],
        [2.58, 4.46],
        [2.16, 6.22],
        [3.27, 3.52]
    ]
    e = copy.deepcopy(x)
    y = [1,1,1,1,2,2,2]
    t = LDA(x, y)
    for a in e:
        r = t.predict(a)
        print(max(r, key=r.get))
Example #4
 def get_model(self, model_type, fo_lang_code, docs):
     model = None
     if model_type == "vsm":
         model = VSM(fo_lang_code=fo_lang_code)
         model.build_model(docs)
     elif model_type == "lda":
         model = LDA(fo_lang_code=fo_lang_code)
         model.build_model(docs, num_topics=60, passes=100)
     return model
Example #5
    def fit(self, X, y):
        self.pca = PCA(n_components=self.pca_components).fit(X)
        pca_projected = self.pca.project(X)

        self.lda = LDA(n_components=self.n_components).fit(pca_projected, y)

        self.subspace = np.dot(self.pca.pro_subspace, self.lda.pro_subspace)

        return self
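A hedged scikit-learn sketch of the same two-stage idea (a PCA projection followed by LDA in the reduced space); the component counts below are illustrative, not the values used above.

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline

pca_lda = make_pipeline(PCA(n_components=50),
                        LinearDiscriminantAnalysis(n_components=2))
# pca_lda.fit(X, y); X_reduced = pca_lda.transform(X)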
Example #6
 def fit(self,data):
     '''Train the model parameters; data = np.array([[value1,value2,...,label],...])'''
     model = LDA(k=self.k)
     new_data,self.W = model.fit(data) # new_data: one sample per row, the last dimension is the class label
     new_data = np.array(new_data) # matrix -> np.ndarray
     print('first 3 samples after dimensionality reduction:', new_data[:3])
     # group the reduced data by class
     m,n = new_data.shape # m samples with n dimensions each; the last dimension is the class label
     # split the data by class
     dataDict = defaultdict(list)
     # iterate over the samples
     for i in range(m):
         dataDict[new_data[i,-1]].append(new_data[i,:-1]) # drop the class label
     self.classes = list(dataDict.keys()) # store the class labels
     # compute each class's mean vector and variance
     for label in dataDict.keys():
         temp = np.array(dataDict[label]) # one sample per row
         mean = temp.mean(axis=0,keepdims=True) # mean vector, kept 2-D
         var = temp.var(axis=0,keepdims=True)
         self.parameters['class' + str(label)] = {'mean':mean,'var':var}
Example #7
class TR_LDA:
    def __init__(self, fo_lang_code, trans_agent):
        self.lda = LDA(fo_lang_code)
        self.trans_agent = trans_agent

    def train(self, docs, num_topics=5, passes=100):
        # docs = [doc.split() for doc in docs]
        trans_docs = []
        for doc in docs:
            trans_doc = translate_sentences(doc, "en")
            trans_docs.append(trans_doc)
        self.lda.train(trans_docs, num_topics=num_topics, passes=passes)

    def get_doc_similarity(self, doc1, doc2):
        en_doc1 = self.trans_agent.get_translated_doc(doc1)
        en_doc2 = self.trans_agent.get_translated_doc(doc2)
        return self.lda.get_doc_similarity(en_doc1, en_doc2)

    def get_model_name(self):
        return "TR-LDA"
Example #8
 def fit(self,data):
     '''Train the model parameters; data = np.array([[value1,value2,...,label],...])'''
     model = LDA(k=self.k)
     new_data,self.W = model.fit(data) # new_data: one sample per row, the last dimension is the class label
     new_data = np.array(new_data) # matrix -> np.ndarray
     # group the reduced data by class
     m,n = new_data.shape # m samples with n dimensions each; the last dimension is the class label
     # split the data by class
     dataDict = defaultdict(list)
     # iterate over the samples
     for i in range(m):
         dataDict[new_data[i,-1]].append(new_data[i,:-1]) # drop the class label
     self.classes = list(dataDict.keys()) # store the class labels
     # compute each class's mean vector, covariance matrix and covariance determinant
     for label in dataDict.keys():
         temp = np.array(dataDict[label]).T # transpose so each column is a sample
         mean = temp.mean(axis=1,keepdims=True) # mean vector, kept 2-D
         covariance = (temp - mean).dot((temp - mean).T) # scatter (unnormalised covariance) matrix, shape=(n,n) where n is the number of features
         determinant = np.linalg.det(covariance) # determinant of the covariance matrix
         inverse_cov = np.mat(covariance).I # inverse of the covariance matrix
         self.parameters['class' + str(label)] = {'mean':mean,'inverse_cov':inverse_cov,'determinant':determinant}
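A hedged sketch of how the parameters stored above (mean, inverse covariance, determinant) could be used at prediction time via the multivariate Gaussian log-density (up to an additive constant); this predict step is assumed, it is not part of the original snippet.

import numpy as np

def class_log_density(x, mean, inverse_cov, determinant):
    # x and mean are column vectors of shape (n, 1)
    diff = x - mean
    return -0.5 * ((diff.T @ inverse_cov @ diff).item() + np.log(determinant))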
Example #9
 def create_features(self):
     cate_col = ['col9',
                 'col609',
                 'col1317',
                 'col1430',
                 'col1975',
                 'col2135',
                 'col3213',
                 'col3289',
                 'col3290',
                 'col3519',
                 'col3591']
     
     co_feature1 = [(cate_col[i],cate_col[j]) for i in range(len(cate_col)) for j in range(i)]
     co_feature2 = [(cate_col[j],cate_col[i]) for i in range(len(cate_col)) for j in range(i)]
     co_feature = co_feature1 + co_feature2
     n_topics,n_lda_fe = 3,10
     lda = LDA(co_feature,n_topics,n_lda_fe)
     lda.fit(train)
     train_lda = lda.transform(train,cate_col)
     test_lda = lda.transform(test,cate_col)
     new_col = [f"{c}_topic{i}" for c in co_feature for i in range(n_topics)]
     for i,c in enumerate(new_col):
         self.train[c]= train_lda[:,i]
         self.test[c] = test_lda[:,i]
Example #10
def analysisEM_GMM(cryo_data, use_PCA=True, normalize=True, title="EM_GMM Results"):
    ### Define a seed that doesn't push two Gaussians right next to each other
    np.random.seed(1)

    ### Reduce dimensionality to 2D
    new_data = []
    if use_PCA:
        new_data = PCA(cryo_data, normalize=normalize)
    else: ### use_LDA
        new_data = LDA(cryo_data, user_dims=2, normalize=normalize)

    ### Run the EM_GMM algorithm to attempt to classify our data points
    EM_GMM(new_data, cryo_data.iloc[:,-1], 2, max_iters=10, title=title)
Example #11
def analysisLDA(cryo_data, normalize=True):
    ### Get results on my own PCA on this dataset
    new_data = LDA(cryo_data, user_dims=2, normalize=normalize)
    plotResults_2D(new_data, cryo_data.iloc[:,-1], 'Custom LDA Results on cryo Dataset - Normalized = '+str(normalize))

    ### Get results to compare to using the sklearn version of LDA on this dataset
    lda = sklearn_LDA(n_components=2)
    if normalize:
        sklearn_data     = sklearn_SS().fit_transform(cryo_data.iloc[:,:-1])
        sklearn_new_data = lda.fit_transform(sklearn_data, cryo_data.iloc[:,-1])
    else:
        sklearn_new_data = lda.fit_transform(cryo_data.iloc[:,:-1], cryo_data.iloc[:,-1])
    plotResults_2D(pd.DataFrame(sklearn_new_data), cryo_data.iloc[:,-1], 'Sklearn LDA Results on cryo Dataset - Normalized = '+str(normalize))
Example #12
def Topic2Vec_v2():
    """
    Convert each sentence into a topic vector.
    :return:
    """
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData +
                                     "topic_data_processed.txt")
    docs = []
    lab = []
    for index, line in enumerate(sentences):
        term = line.strip().split("\t")
        if len(term) != 3:
            continue

        docs.append(term[1])
        lab.append(term[2])
    documents = line_Cut_Word(docs)
    documents = [" ".join(doc) for doc in documents]

    lda.load_word_dic()
    lda.load_LdaModel()
    # lda.build_word_dic(lines(documents))
    # print(len(lda.word_dic.keys()))
    # lda.buildModel(lines(documents))

    result_lab = []
    topic2vec = []
    x_index, y_index = [], []
    count = 0
    print(len(lab))
    for index, doc_lab in enumerate(list(zip(docs, lab))):
        if index % 1000 == 0 and index != 0:
            print(doc_lab[0], doc_lab[1])
            # break
        doc = doc_lab[0]
        la = doc_lab[1]
        topics = lda.getQuerySimilarly(doc)

        if topics:
            # print(doc, "\t", la)
            for topic in topics:
                x_index.append(count)
                y_index.append(topic[0])
                topic2vec.append(topic[1])
            count += 1
            result_lab.append(la)

    print(len(x_index), len(y_index), len(topic2vec), len(result_lab), count)

    result = [x_index, y_index, topic2vec, result_lab]
    with open(config.BTMData + "topic2vec_2.txt", 'wb') as fp:
        pickle.dump(result, fp)
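The three parallel lists built above (x_index, y_index, topic2vec) are COO-style triplets; a hedged sketch, assuming SciPy is available, of assembling them into a sparse document-by-topic matrix:

from scipy.sparse import coo_matrix

def to_topic_matrix(x_index, y_index, topic2vec):
    # rows are document indices, columns are topic ids, values are topic weights
    return coo_matrix((topic2vec, (x_index, y_index)))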
Example #13
 def get_model(self, model_type, fo_lang_code, docs):
     model = None
     if model_type == "vsm":
         model = VSM(fo_lang_code=fo_lang_code)
         model.build_model(docs)
     elif model_type == "lda":
         model = LDA(fo_lang_code=fo_lang_code)
         model.build_model(docs, num_topics=60, passes=100)
     elif model_type == "gvsm":
         model = GVSM(fo_lang_code=fo_lang_code,
                      term_similarity_type=self.term_similarity_type)
         model.build_model(docs)
     elif model_type == "lsi":
         model = LSI(fo_lang_code=fo_lang_code)
         model.build_model(docs, num_topics=60)
     return model
Example #14
def cross_validation(X, Y, folds=5, split_value=0.3, name="lda"):
    # Y = Y.reshape((len(Y), 1))
    # X = np.hstack((X, Y))
    # part = -1
    #
    # if split:
    #     part = split_value
    # else:
    #     part = int(np.math.ceil(len(X) / folds))
    # scores = []
    #
    # for i in range(folds):
    #     test = np.array(X[i * part: (i + 1) * part])
    #     test = [list(d) for d in test]
    #     train = [np.array(j) for j in X if list(j) not in test]
    #     test = np.array(test)
    #     train = np.array(train)
    #
    #     train_x, train_y = train[:, :-1], train[:, -1]
    #     test_x, test_y = test[:, :-1], test[:, -1]
    #
    #     print(train_x.shape)
    #     print(test_x.shape)

    scores = []
    for fold in range(folds):
        train_x, test_x, train_y, test_y = train_test_split(
            X, Y, shuffle=True, test_size=split_value)

        if name == "lda":
            lda = LDA()
            lda.fit(train_x, train_y)

            lda_train_x = lda.transform(train_x)
            lda_test_x = lda.transform(test_x)
        else:
            pca = PCA()
            pca.fit(train_x)

            pca_train_x = pca.transform(train_x)
            pca_test_x = pca.transform(test_x)
        '''classifier'''
        lr = LogisticRegression(solver='saga', n_jobs=4)
        lr.fit(train_x, train_y)
        score = lr.score(test_x, test_y)
        scores.append(score)
        print("accuracy on  fold ", fold, " : ", score)

    mean = np.mean(scores)
    std = np.std(scores)
    print("mean accuracy : ", mean)
    print("standard deviation : ", std)

    return mean, std, scores
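The LDA, PCA, LogisticRegression and train_test_split names used in this example match scikit-learn's API; a hedged guess at the imports the function assumes:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split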
Example #15
def Topic2Vec_v2():
    """
    Convert each sentence into a topic vector.
    :return:
    """
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData + "topic_data_processed.txt")
    docs = []
    lab = []
    for index, line in enumerate(sentences):
        term = line.strip().split("\t")
        if len(term) != 3:
            continue

        docs.append(term[1])
        lab.append(term[2])
    documents = line_Cut_Word(docs)
    documents = [" ".join(doc) for doc in documents]

    lda.load_word_dic()
    lda.load_LdaModel()
    # lda.build_word_dic(lines(documents))
    # print(len(lda.word_dic.keys()))
    # lda.buildModel(lines(documents))

    result_lab = []
    topic2vec = []
    x_index, y_index = [], []
    count = 0
    print(len(lab))
    for index, doc_lab in enumerate(list(zip(docs, lab))):
        if index % 1000 == 0 and index != 0:
            print(doc_lab[0], doc_lab[1])
            # break
        doc = doc_lab[0]
        la = doc_lab[1]
        topics = lda.getQuerySimilarly(doc)

        if topics:
            # print(doc, "\t", la)
            for topic in topics:
                x_index.append(count)
                y_index.append(topic[0])
                topic2vec.append(topic[1])
            count += 1
            result_lab.append(la)

    print(len(x_index), len(y_index), len(topic2vec), len(result_lab), count)

    result = [x_index, y_index, topic2vec, result_lab]
    with open(config.BTMData + "topic2vec_2.txt", 'wb') as fp:
        pickle.dump(result, fp)
Example #16
def train_LDA():
    x_train, y_train, x_test, y_test = get_test_train()
    lda = LDA()
    lda.fit(x_train, y_train)
    y_prediction = np.zeros(len(y_test))
    for i, x in enumerate(x_test):
        y_prediction[i] = lda.predict(x)
    print(y_prediction)
    print(error_rate(y_prediction, y_test))
    y_train_pred = np.zeros(len(y_train))
    for i, x in enumerate(x_train):
        y_train_pred[i] = lda.predict(x)
    print(error_rate(y_train_pred, y_train))
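error_rate is not shown in this example; a hedged sketch of the helper it appears to be (the fraction of mismatched labels):

import numpy as np

def error_rate(y_pred, y_true):
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    return float(np.mean(y_pred != y_true))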
Example #17
def run_cm1_exp2_LDA():
    """
    Based on Jane's suggestion: evaluate the result on the impacted artifacts only
    :return:
    """
    cm1_exp2 = CM1_Experiment2(word_threshold=5)
    # cm1_exp2.select_word_all_doc_threshold()
    cm1_exp2.replace_words_in_target_artifacts()
    cm1_exp2.get_impacted_links()

    vsm = LDA(fo_lang_code="en")
    vsm.train(cm1_exp2.get_docs(), num_topics=20)

    vsm_replace = LDA(fo_lang_code="en")
    vsm_replace.train(cm1_exp2.get_docs(), num_topics=20)

    cm1_exp2.run_origin_model(vsm)
    cm1_exp2.run_replaced_model(vsm_replace)

    cm1_exp2.eval_and_compare()
Example #18
def kFoldCrossValidate(dataset, classificationModel, numFolds=5, shuffle='off', alpha=0.0002, steps=10000):
    """
    numFolds                = number of folds (default: 5)
    dataset                 = numpy array of the preprocessed dataset with last column = labels
    classificationModel     = "LDA" or "LR"
    shuffle                 = 'on' or 'off' - if shuffle is 'on', shuffle rows using np.random.shuffle
    alpha                   = learning rate for logistic regression
    Split the dataset into numFolds (default: 5) equal sections, train the model on the remaining
    numFolds - 1 (default: 4) folds each time, and return the average accuracy (as a float) over the numFolds folds.

    """
    if classificationModel!='LDA' and classificationModel!='LR':
        return -1 #error

    if shuffle=='on':
        np.random.shuffle(dataset)

    foldsList = np.array_split(dataset, numFolds) #np.array_split splits dataset into a list of numFolds number of folds
    totalAccuracy = 0

    for currentFoldIndex in range(numFolds):

        validationData = foldsList[currentFoldIndex] #assign current fold to validationData
        del foldsList[currentFoldIndex] #remove the current fold from the list...
        trainingData = np.vstack(foldsList)
        # vertically stack the remaining elements in the list creating a matrix of
        # the dataset with validation data removed --> creating the training set
        foldsList.insert(currentFoldIndex, validationData) # add it back at the same index to leave the list unchanged

        if classificationModel=='LDA':
            model = LDA(data=trainingData)
            model.fit()
        elif classificationModel=='LR':
            model = LogisticRegression(data=trainingData)
            model.fit(steps=steps, alpha=alpha)
        X_test = validationData[:,:-1] #remove last col of validationData
        y_predict = model.predict(X_test=X_test)
        y_test = validationData[:,-1][:,np.newaxis] #last col of validationData
        totalAccuracy += utils.evaluate_acc(y_predict=y_predict, y_test=y_test)

    return totalAccuracy / numFolds
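A hedged scikit-learn sketch of the same k-fold procedure, using KFold to produce the splits; model_factory is a hypothetical callable that returns an unfitted model with fit/predict methods.

import numpy as np
from sklearn.model_selection import KFold

def kfold_accuracy(X, y, model_factory, num_folds=5, shuffle=True):
    kf = KFold(n_splits=num_folds, shuffle=shuffle)
    accuracies = []
    for train_idx, val_idx in kf.split(X):
        model = model_factory()
        model.fit(X[train_idx], y[train_idx])
        accuracies.append(np.mean(model.predict(X[val_idx]) == y[val_idx]))
    return float(np.mean(accuracies))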
Example #19
def LoadTextData(Country, gallery_id):
    S ,Data  = Load_GalLery_Textual_Data(Country,gallery_id)
    
    S1 ,Data1  = Load_GoogleVision_Labels(Country,gallery_id)
    
    labels = [Preprocessing(x['label']) for x in S1[0]]
    labels.append(Preprocessing(S1[1]))
    
    DocList = S[1]
    DocList.append(S[0])

    for s in S[2]:
        DocList.extend(s)
    
    data_lemmatized = PrepareData(DocList)
    lda_model,id2word,corpus = LDA(data_lemmatized,num_topics=20)#len(labels))
    Topic_Words = Topics_Words(lda_model,num_words=len(labels))   
    
    return Topic_Words,labels
Example #20
def kfold(data, k, m):
    # shuffles the data and group them into k groups
    numpy.random.shuffle(data)
    groups = numpy.array_split(data, k, axis=0)
    # averages the acuracy of k predictions
    acc = 0
    for i in range(k):
        val_set = groups[i][:, 0:-1]
        true_val = groups[i][:, -1]
        train_set = numpy.concatenate(groups[:i] + groups[i + 1:], axis=0)
        # build a model using the training set
        if m == Model.logreg:
            model = LogReg(train_set)
            w = model.fit(0.1, 100)
        elif m == Model.lda:
            model = LDA(train_set)
            w = model.fit(model.cvinv, model.u)
        r = model.predict(val_set, w)
        acc += evaluate_acc(r, true_val)
    acc /= k
    return acc
Example #21
    def __init__(self, data):
        self.data = data
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.ws = WordShape()

        # train and predict word2vec clustering
        print('training word2vec ...')
        for doc in data:
            for word in doc:
                self.tokens.append(word[0])
        w2v = word2vec(self.tokens, 5)
        w2v.train()
        self.w2v_dict = dict(zip(self.tokens, w2v.predict()))

        print('train brown clustering ...')
        brown_wrapper = BrownWrapper(data)
        self.brown_dict = brown_wrapper.get_brown_clustering()

        print('train LDA topic clustering ...')
        self.lda = LDA(self.tokens)

        print('\nextracting features ...\n')
Example #22
    def get_trained_model(self):

        if self.model_name == 'LDA':
            self.model = LDA(self.x_train, self.y_train, self.x_test,
                             self.y_test)
            self.model.train_model()

        elif self.model_name == 'LR':
            self.model = LogisticReg(self.x_train, self.y_train, self.x_test,
                                     self.y_test)
            self.model.train_model()

        elif self.model_name == 'MLP':
            self.model = MultiLayerPerceptron(self.x_train, self.y_train,
                                              self.x_test, self.y_test)
            self.model.train_model()

        elif self.model_name == 'SVM':
            self.model = SupportVectorMachine(self.x_train, self.y_train,
                                              self.x_test, self.y_test)
            self.model.train_model()

        return self.model.get_model()
Example #23
import numpy as np
from generate_data import gen_data
from LDA import LDA
import matplotlib.pyplot as plt

d1, d2, c1, c2 = gen_data(10)
w, mu1, mu2 = LDA(d1, d2)
w = w / np.linalg.norm(w)
#print(np.linalg.norm(w))
a = w[1] / w[0]

plt.figure(1)
plt.axis('equal')
x = np.linspace(-8, 8, 100)
y = a * x
plt.plot(x, y)

for x1, wx1 in zip(d1, d1.dot(w)):
    #print(wx1)
    y1 = wx1 * w
    plt.scatter(x1[0], x1[1], color='r')
    plt.scatter(y1[0], y1[1], color='g')
    plt.plot([x1[0], y1[0]], [x1[1], y1[1]], color='y', linestyle='--')

for x1, wx1 in zip(d2, d2.dot(w)):
    #print(wx1)
    y1 = wx1 * w
    plt.scatter(x1[0], x1[1], color='b')
    plt.scatter(y1[0], y1[1], color='skyblue')
    plt.plot([x1[0], y1[0]], [x1[1], y1[1]], color='y', linestyle='--')

plt.show()
Example #24
def lda():

    path_hasil = 'static/assets/LDA/hasil/'

    if request.method == 'POST':
        id_time = datetime.datetime.now().time().microsecond

        input_data_test_ke = (request.form['datatest_input'])
        pilih_metode = str(request.form['pilih_metode'])

        # print(pilih_metode,type(pilih_metode))
        data = ORL_face.data
        data_train = ORL_face.data_train
        data_test = ORL_face.data_test

        if input_data_test_ke != "SEMUA":

            data_test_ke = data_test[int(input_data_test_ke)]
            orang, pose = LDA(input_data_test_ke).calc_lda(
                data_test_ke, pilih_metode)
            temp_datatest = np.reshape(data_test_ke,
                                       (data.shape[2], data.shape[3]))
            result = Image.new("RGB", (92 * 2, 112))

            img_train = (Image.fromarray(data[orang[0] - 1][pose[0] - 1]))
            x = 0
            y = 0
            w, h = img_train.size
            # print('pos {0},{1} size {2},{3}'.format(x, y, w, h))
            result.paste(img_train, (x, y, x + w, y + h))

            img_test = (Image.fromarray(temp_datatest))
            x = (1) * 92
            y = (1) * 0
            w, h = img_test.size
            # print('pos {0},{1} size {2},{3}'.format(x, y, w, h))
            result.paste(img_test, (x, y, x + w, y + h))
            filename_img = str(id_time) + "_" + pilih_metode + "_" + str(1)
            result.save('static/assets/LDA/hasil/' + filename_img + '.png')
            hasil_keterangan = PCA.eval(None, int(input_data_test_ke),
                                        orang - 1)
            return render_template('LDA.html',
                                   query_path=input_data_test_ke,
                                   img_hasil=filename_img,
                                   keterangan=hasil_keterangan)

        elif input_data_test_ke == "SEMUA":
            temp_benar = 0
            temp_salah = 0
            temp_keterangan = []
            temp_img = []
            for input_data_test_ke in range((ORL_face.data_test.shape[0])):
                data_test_ke = data_test[int(input_data_test_ke)]
                orang, pose = LDA(input_data_test_ke).calc_lda(
                    data_test_ke, pilih_metode)
                temp_datatest = np.reshape(data_test_ke,
                                           (data.shape[2], data.shape[3]))
                result = Image.new("RGB", (92 * 2, 112))

                img_train = (Image.fromarray(data[orang[0] - 1][pose[0] - 1]))
                x = 0
                y = 0
                w, h = img_train.size
                # print('pos {0},{1} size {2},{3}'.format(x, y, w, h))
                result.paste(img_train, (x, y, x + w, y + h))

                img_test = (Image.fromarray(temp_datatest))
                x = (1) * 92
                y = (1) * 0
                w, h = img_test.size
                # print('pos {0},{1} size {2},{3}'.format(x, y, w, h))
                result.paste(img_test, (x, y, x + w, y + h))
                filename_img = str(id_time) + "__" + pilih_metode + "_" + str(
                    input_data_test_ke + 1)
                result.save('static/assets/LDA/hasil/' + filename_img + '.png')
                hasil_keterangan = PCA.eval(None, int(input_data_test_ke),
                                            orang - 1)
                temp_keterangan.append(hasil_keterangan)
                temp_img.append(filename_img)
                if hasil_keterangan == 'benar':
                    temp_benar += 1
                else:
                    temp_salah += 1

            hasil_akurasi = (temp_benar / (temp_benar + temp_salah)) * 100
            return render_template('LDA.html',
                                   query_path="SEMUA",
                                   len_data_test=ORL_face.data_test.shape[0],
                                   img=temp_img,
                                   akurasi=hasil_akurasi,
                                   keterangan=temp_keterangan)
    else:
        return render_template('LDA.html')
Example #25
from LDA import LDA
from QDA import QDA
import scipy.io as sio
import csv

# problem 6d

data = sio.loadmat('spam/spam_data_2.mat')
train_X = data['training_data']
train_y = data['training_labels'][0]
test_X = data['test_data']
# validate_X = train_X[:5000,:]
# validate_y = train_y[:5000]
# train_X = train_X[5000:,:]
# train_y = train_y[5000:]

cls_lda = LDA(2, 104, 0.0001)
cls_lda.fit(train_X, train_y)
y_predict = cls_lda.predict(test_X)

# error = 0
# for i in range(validate_X.shape[0]):
#     error += (int(validate_y[i]) != int(y_predict[i]))
# error /= validate_X.shape[0]
# print(error)

with open('spam_predict.csv', 'wt') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(['Id', 'Category'])
    for i in range(y_predict.shape[0]):
        writer.writerow([i, int(y_predict[i][0])])
Example #26
dump_file = (
    "/home1/roy/QGen/DGen/Dataset/enwiki-latest-pages-articles.xml.bz2"
)  # for generate_input_files
bow_path = ("/home1/roy/QGen/DGen/CDC/data/full_wiki_bow.mm"
            )  # doc to [(word_id, count)..] mapping
dict_path = ("/home1/roy/QGen/DGen/CDC/data/full_wiki.dictionary"
             )  # word_id to word mapping
model_file = (
    "/home1/roy/QGen/DGen/CDC/models/ldamodel_topics100_trainiter20_full_en.gensim"
)
num_topics = 100

id2word_dict = gensim.corpora.Dictionary.load(dict_path)
# print(id2word_dict.token2id.items()[:100])

lda = LDA()
debug = False
lda.load(model_file)
print("Load LDA model")
conceptualizer = Conceptualizer.Conceptualizer(lda)


def search_e_from_c(c, concept, k):
    """
    Find all entities under concept
    :param c: the database cursor
    :param concept: concept to be searched
    :param k: maximum number of entities to be generated
    :return: a sorted list containing (entity_name, frequency) pairs
    """
    cursor = c.execute(
Example #27
    label_test = np.zeros((ntest, 1))
    for i in range(0, c):
        for j in range(0, n):
            x_train[:, n * i + j] = np.reshape(t[:, j, i], (d, 1))
            label_train[n * i + j] = i
        for j in range(0, (21 - n)):
            x_test[:, (21 - n) * i + j] = np.reshape(t[:, n + j, i], (d, 1))
            label_test[(21 - n) * i + j] = i
''' dimensionality reduction or original data'''
if opt == 1:
    x_train, x_test = x_train, x_test
elif opt == 2:
    x_train, x_test = PCA(x_train, x_test)
    d, _ = x_train.shape
elif opt == 3:
    x_train, x_test = LDA(x_train, x_test, c, n)
    d, _ = x_train.shape
''' estimating mean and variance from train data'''

mean_train = np.matrix(np.zeros((d, c)), dtype=complex)
for k in range(0, c):
    for l in range(0, n):
        mean_train[:, k] = mean_train[:, k] + x_train[:, n * (k) + l]
    mean_train[:, k] = (1 / n) * mean_train[:, k]

cov_train = np.zeros((d, d, c), dtype=complex)
cov_inv = np.zeros((d, d, c), dtype=complex)
for a in range(0, c):
    for b in range(0, n):
        cov_train[:, :, a] = cov_train[:, :, a] + (
            (x_train[:, n * (a) + b] - mean_train[:, a]) *
Example #28
def train_lad():
	lda = LDA()
	sentences = ReadFile.readTXTFile(config.BTMData + "btm_text_corpus.txt")
	# line = LineSetence(sentences=sentences)
	lda.buildModel(lines(sentences), num_topics=21)
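The buildModel wrapper used above is not shown; for reference, a hedged sketch (assuming gensim) of the kind of LdaModel training such a wrapper typically performs. The corpus and file name below are toy placeholders.

from gensim import corpora
from gensim.models import LdaModel

texts = [doc.split() for doc in ["topic model example", "another short document"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)
model.save("lda_model.gensim")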
Example #29
def start(args):

    if args.start == "Cold":
        tdm = TwintDataMiner()
        attributes = ["id", "tweet", "place", "date"]

        before_df = get_before_tweets(attributes=attributes,
                                      tdm=TwintDataMiner())
        after_df = get_after_tweets(attributes=attributes,
                                    tdm=TwintDataMiner())

        print(before_df.shape)
        print(before_df.head())

        print(after_df.shape)
        print(after_df.head())

        # drop_duplicates returns a new DataFrame, so reassign the result
        before_df = before_df.drop_duplicates(subset='id')
        print(before_df.shape)
        print("----")
        after_df = after_df.drop_duplicates(subset='id')
        print(after_df.shape)

        # LDA declarations
        lda_before_orig = LDA()
        lda_after_orig = LDA()
        lda_before_bow = LDA()
        lda_after_bow = LDA()

        # Personalised BOW
        print("Personalised BOW")
        lemmatised_vocab = pd.DataFrame()
        lemmatised_vocab['column'] = [final_bow]
        lda_after_bow.dict_from_vocab(doc=lemmatised_vocab.iloc[0])
        lda_before_bow.dict_from_vocab(doc=lemmatised_vocab.iloc[0])
        print("BOW created")

        print("Processing docs...")
        before_docs = process_docs(before_df, lda_before_bow)
        print("Processed before docs.")
        after_docs = process_docs(after_df, lda_after_bow)
        print("Processed after docs.")

        # Original
        lda_before_orig.dict_from_vocab(doc=before_docs)
        lda_after_orig.dict_from_vocab(doc=after_docs)

        # Store before and after docs
        store_as_pickle("before_docs", before_docs)
        store_as_pickle("after_docs", after_docs)

        #Store lda objects
        store_as_pickle("lda_before_orig", lda_before_orig)
        store_as_pickle("lda_after_orig", lda_after_orig)
        store_as_pickle("lda_before_bow", lda_before_bow)
        store_as_pickle("lda_after_bow", lda_after_bow)

        #calculate corpus'
        bow_corpus_before_bow = lda_before_bow.generateBagOfWords(
            docs=before_docs)
        bow_corpus_after_bow = lda_after_bow.generateBagOfWords(
            docs=after_docs)

        bow_corpus_before_orig = lda_before_orig.generateBagOfWords(
            docs=before_docs)
        bow_corpus_after_orig = lda_after_orig.generateBagOfWords(
            docs=after_docs)

        # store corpus'
        store_as_pickle(name="bow_corpus_before_bow",
                        obj=bow_corpus_before_bow)
        store_as_pickle(name="bow_corpus_after_bow", obj=bow_corpus_after_bow)

        store_as_pickle(name="bow_corpus_before_orig",
                        obj=bow_corpus_before_orig)
        store_as_pickle(name="bow_corpus_after_orig",
                        obj=bow_corpus_after_orig)

        # calculate tfidfs
        corpus_tfidf_before_orig = lda_before_orig.tf_idf(
            bow_corpus=bow_corpus_before_orig)
        corpus_tfidf_after_orig = lda_after_orig.tf_idf(
            bow_corpus=bow_corpus_after_orig)

        corpus_tfidf_before_bow = lda_before_bow.tf_idf(
            bow_corpus=bow_corpus_before_bow)
        corpus_tfidf_after_bow = lda_after_bow.tf_idf(
            bow_corpus=bow_corpus_after_bow)

        # store tfidfs
        store_as_pickle(name="corpus_tfidf_before_orig",
                        obj=corpus_tfidf_before_orig)
        store_as_pickle(name="corpus_tfidf_after_orig",
                        obj=corpus_tfidf_after_orig)

        store_as_pickle(name="corpus_tfidf_before_bow",
                        obj=corpus_tfidf_before_bow)
        store_as_pickle(name="corpus_tfidf_after_bow",
                        obj=corpus_tfidf_after_bow)

    elif args.start == "Warm":

        # get before and after docs
        before_docs = get_pickle_object(name="before_docs")
        after_docs = get_pickle_object(name="after_docs")
        print("Docs loaded")

        #get lda objects
        lda_before_orig = get_pickle_object(name="lda_before_orig")
        lda_after_orig = get_pickle_object(name="lda_after_orig")
        lda_before_bow = get_pickle_object(name="lda_before_bow")
        lda_after_bow = get_pickle_object(name="lda_after_bow")
        print("LDAs loaded")

        # get corpus'
        bow_corpus_before_bow = get_pickle_object(name="bow_corpus_before_bow")
        bow_corpus_after_bow = get_pickle_object(name="bow_corpus_after_bow")
        bow_corpus_before_orig = get_pickle_object(
            name="bow_corpus_before_orig")
        bow_corpus_after_orig = get_pickle_object(name="bow_corpus_after_orig")
        print("Corpus' loaded")

        # get tfidfs
        corpus_tfidf_before_orig = get_pickle_object(
            name="corpus_tfidf_before_orig")
        corpus_tfidf_after_orig = get_pickle_object(
            name="corpus_tfidf_after_orig")
        corpus_tfidf_before_bow = get_pickle_object(
            name="corpus_tfidf_before_bow")
        corpus_tfidf_after_bow = get_pickle_object(
            name="corpus_tfidf_after_bow")
        print("TFIDF' loaded\n\n")

        alpha_value = 0.025
        for i in range(1, 4):
            passes = 10
            alpha_value *= 2
            print(f"passes={passes}, alpha: {alpha_value}\n")
            print("--------------Original--------------")
            for no_topics in range(3, 5):
                alpha = [alpha_value] * no_topics
                print("Before")
                before_orig = train_lda(no_topics,
                                        corpus_tfidf_before_orig,
                                        lda_before_orig,
                                        passes=passes,
                                        alpha=alpha)
                topics = before_orig.show_topics(formatted=False)
                plot(model=before_orig,
                     docs=before_docs,
                     type_model="orig",
                     date="before",
                     passes=passes,
                     height=0.02,
                     alpha=alpha)
                # t_SNE(lda_model=before_orig, corpus=corpus_tfidf_before_orig, type_model="orig", topics=topics, date="before", passes=passes)
                print("------------  ------------")
                print("After")
                after_orig = train_lda(no_topics,
                                       corpus_tfidf_after_orig,
                                       lda_after_orig,
                                       passes=passes,
                                       alpha=alpha)
                topics = after_orig.show_topics(formatted=False)
                plot(model=after_orig,
                     docs=after_docs,
                     type_model="orig",
                     date="after",
                     passes=passes,
                     height=0.02,
                     alpha=alpha)
                # t_SNE(lda_model=after_orig, corpus=corpus_tfidf_after_orig, type_model="orig", topics=topics, date="after", passes=passes)
                print("\n\n")

            store_as_pickle(name=f"after-orig_passes={passes}", obj=after_orig)
            store_as_pickle(name=f"before-orig_passes={passes}",
                            obj=before_orig)

            print("--------------BOW--------------")
            for no_topics in range(2, 5):
                alpha = [alpha_value] * no_topics
                print("Before")
                before_bow = train_lda(no_topics,
                                       corpus_tfidf_before_bow,
                                       lda_before_bow,
                                       passes=passes,
                                       alpha=alpha)
                topics = before_bow.show_topics(formatted=False)
                plot(model=before_bow,
                     docs=before_docs,
                     type_model="bow",
                     date="before",
                     passes=passes,
                     alpha=alpha)
                # t_SNE(lda_model=before_bow, corpus=corpus_tfidf_before_bow, type_model="bow", topics=topics, date="before", passes=passes)
                print("------------  ------------")
                print("After")
                after_bow = train_lda(no_topics,
                                      corpus_tfidf_after_bow,
                                      lda_after_bow,
                                      passes=passes,
                                      alpha=alpha)
                topics = after_bow.show_topics(formatted=False)
                plot(model=after_bow,
                     docs=after_docs,
                     type_model="bow",
                     date="after",
                     passes=passes,
                     alpha=alpha)
                # t_SNE(lda_model=after_bow, corpus=corpus_tfidf_after_bow, type_model="bow", topics=topics, date="after", passes=passes)

                print("\n\n")

                store_as_pickle(name=f"after-bow_passes={passes}",
                                obj=after_bow)
                store_as_pickle(name=f"before-bow_passes={passes}",
                                obj=before_bow)
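store_as_pickle and get_pickle_object are not shown in this example; a hedged sketch of the persistence helpers they appear to be (the folder path is illustrative):

import pickle

def store_as_pickle(name, obj, folder="pickles/"):
    with open(folder + name + ".pkl", "wb") as f:
        pickle.dump(obj, f)

def get_pickle_object(name, folder="pickles/"):
    with open(folder + name + ".pkl", "rb") as f:
        return pickle.load(f)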
Example #30
from LDA import LDA
from classifier import Classifier

from tkinter import *
from tkinter import ttk
from tkinter.filedialog import askopenfilename
from tkinter import messagebox

labels = [
    'Chinh tri Xa hoi', 'Doi song', 'Khoa hoc', 'Kinh doanh', 'Phap luat',
    'Suc khoe', 'The gioi', 'The thao', 'Van hoa', 'Vi tinh'
]

lda = LDA()
root = Tk()
classifier = Classifier(type_model='SVM')

Title = root.title("File Opener")
label = ttk.Label(root, text="Text Classification", font=("Helvetica", 16))
label.pack()

#Menu Bar


def OpenFile():
    name = askopenfilename(initialdir="./data/Test_Full/",
                           filetypes=(("Text File", "*.txt"), ("All Files",
                                                               "*.*")),
                           title="Choose a file.")
    topic_vec = lda.cluster(name)
    result = classifier.predict(topic_vec)
Example #31
def train_lad():
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData + "btm_text_corpus.txt")
    # line = LineSetence(sentences=sentences)
    lda.buildModel(lines(sentences), num_topics=21)
Example #32
# shuffle the data & labels
idx = NP.arange(data.shape[0])
NP.random.shuffle(idx)
data = data[idx, ]
labels = labels[idx]

# set number of dimensions in rescaled data
dim_rescale = 3

# {'London': 0, 'Austen': 1, 'Milton': 2, 'Shakespeare': 3}
ndx0 = labels == 0
ndx1 = labels == 1
ndx2 = labels == 2
ndx3 = labels == 3

rescaled_data, w = LDA(data, labels, dim_rescale)

assert NP.sum(ndx0) + NP.sum(ndx1) + NP.sum(ndx2) + NP.sum(
    ndx3) == data.shape[0]

class0 = rescaled_data[ndx0, ]
class1 = rescaled_data[ndx1, ]
class2 = rescaled_data[ndx2, ]
class3 = rescaled_data[ndx3, ]

#----------------------- plotting ----------------------#

x0, y0, z0 = data[:, 0], data[:, 1], data[:, 2]
x1, y1, z1 = class0[:, 0], class0[:, 1], class0[:, 2]
x2, y2, z2 = class1[:, 0], class1[:, 1], class1[:, 2]
x3, y3, z3 = class2[:, 0], class2[:, 1], class2[:, 2]
Example #33
    start_time = time.time()
    MODEL_BASE_DIR = 'F:/Not_Uploaded/conceptualization_eval/models/'
    EVALUATION_DATASET = '../ratings.txt'
    MODEL_FILE_EXTENSION = '.gensim'
    model_names = [
        'ldamodel_simple_mallet_20_10_keep_300000_gensimstop_topics100',
        'ldamodel_topics100_trainiter20_en_noStopWords',
        'ldamodel_topics100_trainiter20_full_en',
        'ldamodel_topics100_trainiter20_train_en',
    ]
    model_stat = []
    eval_set = get_eval_set(EVALUATION_DATASET)
    for model_name in model_names:
        print("\nTest", model_name, ':')
        lda = LDA()
        lda.load(MODEL_BASE_DIR + model_name + MODEL_FILE_EXTENSION)

        conceptualizer = Conceptualizer(lda)

        none_counter = 0
        estimated_similarities = []
        real_similarities = []

        for element in eval_set:
            estimated_similarity = evaluate(element[1], element[2], element[3],
                                            element[4])
            if estimated_similarity is None:
                none_counter += 1
                continue
            estimated_similarities.append(float(estimated_similarity[0][0]))