def prepare_multi_corpora(self, lda_model_1, lda_model_2, corpus_1,
                          corpus_2, id2word_1, id2word_2,
                          matrix_documents_topic_contribution_1,
                          matrix_documents_topic_contribution_2,
                          topic_similarity_matrix):
    # Fetch this client's cached multi-corpora state (keyed by real IP)
    # so the expensive prepare() calls below run only once per session.
    global multi_corpora_datasets
    ip = request.environ.get("HTTP_X_REAL_IP")
    multi_corpora_data = multi_corpora_datasets[ip]

    if 'data_dict_1' not in multi_corpora_data:
        data_dict_1 = gensim_helpers.prepare(lda_model_1, corpus_1, id2word_1)
        multi_corpora_data['data_dict_1'] = data_dict_1
    if 'PreparedDataObtained_collection_1' not in multi_corpora_data:
        temp_1 = prepare(**multi_corpora_data['data_dict_1'])
        multi_corpora_data['PreparedDataObtained_collection_1'] = temp_1.to_dict()
    if 'relevantDocumentsDict_collection_1' not in multi_corpora_data:
        multi_corpora_data['relevantDocumentsDict_collection_1'] = \
            matrix_documents_topic_contribution_1.to_dict(orient='records')

    if 'data_dict_2' not in multi_corpora_data:
        data_dict_2 = gensim_helpers.prepare(lda_model_2, corpus_2, id2word_2)
        multi_corpora_data['data_dict_2'] = data_dict_2
    if 'PreparedDataObtained_collection_2' not in multi_corpora_data:
        temp_2 = prepare(**multi_corpora_data['data_dict_2'])
        multi_corpora_data['PreparedDataObtained_collection_2'] = temp_2.to_dict()
    if 'relevantDocumentsDict_collection_2' not in multi_corpora_data:
        multi_corpora_data['relevantDocumentsDict_collection_2'] = \
            matrix_documents_topic_contribution_2.to_dict(orient='records')

    # Store the models and their inputs for later reuse.
    multi_corpora_data['lda_model_1'] = lda_model_1
    multi_corpora_data['lda_model_2'] = lda_model_2
    multi_corpora_data['corpus_1'] = corpus_1
    multi_corpora_data['corpus_2'] = corpus_2
    multi_corpora_data['id2word_1'] = id2word_1
    multi_corpora_data['id2word_2'] = id2word_2
    multi_corpora_data['topic_similarity_matrix'] = topic_similarity_matrix
    multi_corpora_data['topic_order_collection_1'] = multi_corpora_data[
        'PreparedDataObtained_collection_1']['topic.order']
    multi_corpora_data['topic_order_collection_2'] = multi_corpora_data[
        'PreparedDataObtained_collection_2']['topic.order']
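# NOTE: the per-client stores indexed above and in the methods below
# ({multi_corpora,single_corpus}_datasets) are assumed to be module-level
# defaultdicts keyed by client IP. A minimal sketch of that assumption
# (not necessarily this module's actual initialization):
#
#     from collections import defaultdict
#     single_corpus_datasets = defaultdict(dict)   # ip -> session cache
#     multi_corpora_datasets = defaultdict(dict)   # ip -> session cache
#
# A plain dict would raise KeyError on a client's first request, since the
# methods index the store before populating it.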
def prepare_single_corpus(self, lda_model, corpus, id2word,
                          matrix_documents_topic_contribution,
                          topic_similarity_matrix):
    # Fetch this client's cached single-corpus state (keyed by real IP).
    global single_corpus_datasets
    ip = request.environ.get("HTTP_X_REAL_IP")
    single_corpus_data = single_corpus_datasets[ip]

    if 'data_dict' not in single_corpus_data:
        data_dict = gensim_helpers.prepare(lda_model, corpus, id2word)
        single_corpus_data['data_dict'] = data_dict
    if 'PreparedDataObtained' not in single_corpus_data:
        print("A new PreparedData object has been created!")
        temp = prepare(**single_corpus_data['data_dict'])
        single_corpus_data['PreparedDataObtained'] = temp.to_dict()
    if 'relevantDocumentsDict' not in single_corpus_data:
        single_corpus_data['relevantDocumentsDict'] = \
            matrix_documents_topic_contribution.to_dict(orient='records')

    # Recompute the 2-D layout of the topic circles from the similarity matrix.
    new_circle_positions = get_circle_positions(topic_similarity_matrix)

    single_corpus_data['lda_model'] = lda_model
    single_corpus_data['corpus'] = corpus
    single_corpus_data['id2word'] = id2word
    single_corpus_data['topic_similarity_matrix'] = topic_similarity_matrix
    single_corpus_data['topic.order'] = single_corpus_data[
        'PreparedDataObtained']['topic.order']
    single_corpus_data['new_circle_positions'] = new_circle_positions
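# `get_circle_positions` is defined elsewhere in this module. The sketch below
# shows one plausible implementation, assuming the goal is to project the
# topic similarity matrix to 2-D circle coordinates via metric MDS (the same
# family of layout pyLDAvis uses for its intertopic distance map). The helper
# name and the sklearn-based approach are illustrative assumptions, not the
# project's confirmed implementation.
@staticmethod
def _get_circle_positions_sketch(topic_similarity_matrix):
    import numpy as np
    from sklearn.manifold import MDS

    # Convert similarities in [0, 1] to dissimilarities for MDS.
    dissimilarity = 1.0 - np.asarray(topic_similarity_matrix)
    mds = MDS(n_components=2, dissimilarity='precomputed', random_state=0)
    # Each row is an (x, y) position for one topic circle.
    return mds.fit_transform(dissimilarity)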
def calculate_topic_similarity_on_single_corpus(
        self, word_embedding_model, lda_model, corpus, id2word,
        matrix_documents_topic_contribution, topn_terms, topk_documents,
        relevance_lambda):
    global single_corpus_datasets
    ip = request.environ.get("HTTP_X_REAL_IP")
    single_corpus_data = single_corpus_datasets[ip]
    print("Calculating a new topic similarity matrix")

    if 'data_dict' not in single_corpus_data:
        data_dict = gensim_helpers.prepare(lda_model, corpus, id2word)
        single_corpus_data['data_dict'] = data_dict
    if 'PreparedDataObtained' not in single_corpus_data:
        temp = prepare(**single_corpus_data['data_dict'])
        single_corpus_data['PreparedDataObtained'] = temp.to_dict()
    if 'relevantDocumentsDict' not in single_corpus_data:
        single_corpus_data['relevantDocumentsDict'] = \
            matrix_documents_topic_contribution.to_dict(orient='records')

    # Update the word embedding model for this session.
    single_corpus_data['word_embedding_model'] = word_embedding_model

    # Get the most relevant keywords, sorted by relevance.
    # When merging topics, this list should be updated.
    tinfo_collection_1 = pd.DataFrame.from_dict(
        single_corpus_data['PreparedDataObtained']['tinfo'])
    tinfo_collection_1['relevance'] = (
        relevance_lambda * tinfo_collection_1['logprob'] +
        (1.00 - relevance_lambda) * tinfo_collection_1['loglift'])

    # Build the top-keyword and top-relevant-document vectors.
    topkeywords_vectors_dict_1, relevantdocuments_vectors_dict_1 = \
        get_topkeywords_relevantdocuments_vectors(
            word_embedding_model, lda_model,
            pd.DataFrame(single_corpus_data['relevantDocumentsDict']),
            topn_terms, tinfo_collection_1, topk_documents)

    # Save the intermediate results for later reuse.
    single_corpus_data['tinfo_collection'] = tinfo_collection_1
    single_corpus_data['topkeywords_vectors_dict'] = topkeywords_vectors_dict_1
    single_corpus_data['relevantdocuments_vectors_dict'] = \
        relevantdocuments_vectors_dict_1

    # A single corpus is compared against itself, so both sides of every
    # argument pair receive the same collection.
    return get_dict_topic_similarity_matrix(
        word_embedding_model, lda_model, matrix_documents_topic_contribution,
        lda_model, matrix_documents_topic_contribution, topn_terms,
        single_corpus_data['PreparedDataObtained'],
        single_corpus_data['PreparedDataObtained'], topk_documents,
        relevance_lambda, tinfo_collection_1, tinfo_collection_1,
        topkeywords_vectors_dict_1, topkeywords_vectors_dict_1,
        relevantdocuments_vectors_dict_1, relevantdocuments_vectors_dict_1)
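# The relevance score computed above follows Sievert & Shirley (2014), the
# same definition pyLDAvis uses:
#     relevance(w | t) = lambda * log p(w | t) + (1 - lambda) * log lift(w, t)
# A tiny self-contained check of the same arithmetic (toy numbers, not app data):
@staticmethod
def _relevance_example():
    import pandas as pd
    tinfo = pd.DataFrame({'logprob': [-2.0, -4.0], 'loglift': [0.5, 2.0]})
    lam = 0.6
    tinfo['relevance'] = lam * tinfo['logprob'] + (1.0 - lam) * tinfo['loglift']
    # relevance = [0.6*-2.0 + 0.4*0.5, 0.6*-4.0 + 0.4*2.0] = [-1.0, -1.6]
    return tinfo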
def calculate_topic_similarity_on_multi_corpora_metric_baseline(
        self, word_embedding_model, lda_model_1, lda_model_2, corpus_1,
        corpus_2, id2word_1, id2word_2, relevance_lambda=0.6,
        topn_terms=20):
    # Get the prepared data of lda_model_1.
    data_dict_1 = gensim_helpers.prepare(lda_model_1, corpus_1, id2word_1)
    prepared_data_topic_1 = prepare(**data_dict_1)

    # Get the prepared data of lda_model_2.
    data_dict_2 = gensim_helpers.prepare(lda_model_2, corpus_2, id2word_2)
    prepared_data_topic_2 = prepare(**data_dict_2)

    return generar_matrix_baseline_metric(word_embedding_model,
                                          prepared_data_topic_1,
                                          prepared_data_topic_2,
                                          relevance_lambda, topn_terms)
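# `generar_matrix_baseline_metric` ("generate baseline metric matrix") is
# defined elsewhere in the project. The sketch below shows the kind of
# baseline it plausibly computes, assuming each topic is embedded as the mean
# word vector of its most relevant terms and topics are compared by cosine
# similarity. The helper name `_baseline_metric_sketch` and this exact
# procedure are illustrative assumptions, not the confirmed implementation.
@staticmethod
def _baseline_metric_sketch(word_embedding_model, topic_terms_1, topic_terms_2):
    # topic_terms_*: list of lists, one list of top terms per topic.
    import numpy as np

    def topic_vector(terms):
        # Mean embedding of the terms present in the model's vocabulary.
        vecs = [word_embedding_model[w] for w in terms
                if w in word_embedding_model]
        return np.mean(vecs, axis=0)

    v1 = np.array([topic_vector(t) for t in topic_terms_1])
    v2 = np.array([topic_vector(t) for t in topic_terms_2])
    # Normalize rows so the dot product gives cosine similarity:
    # rows = topics of collection 1, columns = topics of collection 2.
    v1 /= np.linalg.norm(v1, axis=1, keepdims=True)
    v2 /= np.linalg.norm(v2, axis=1, keepdims=True)
    return v1 @ v2.T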
def calculate_topic_similarity_on_multi_corpora(
        self, word_embedding_model, lda_model_1, lda_model_2, corpus_1,
        corpus_2, id2word_1, id2word_2,
        matrix_documents_topic_contribution_1,
        matrix_documents_topic_contribution_2, topn_terms, topk_documents,
        relevance_lambda):
    global multi_corpora_datasets
    global single_corpus_datasets
    ip = request.environ.get("HTTP_X_REAL_IP")
    multi_corpora_data = multi_corpora_datasets[ip]
    single_corpus_data = single_corpus_datasets[ip]

    if 'data_dict_1' not in multi_corpora_data:
        data_dict_1 = gensim_helpers.prepare(lda_model_1, corpus_1, id2word_1)
        multi_corpora_data['data_dict_1'] = data_dict_1
    if 'PreparedDataObtained_collection_1' not in multi_corpora_data:
        temp_1 = prepare(**multi_corpora_data['data_dict_1'])
        multi_corpora_data['PreparedDataObtained_collection_1'] = temp_1.to_dict()
    if 'relevantDocumentsDict_collection_1' not in multi_corpora_data:
        multi_corpora_data['relevantDocumentsDict_collection_1'] = \
            matrix_documents_topic_contribution_1.to_dict(orient='records')

    if 'data_dict_2' not in multi_corpora_data:
        data_dict_2 = gensim_helpers.prepare(lda_model_2, corpus_2, id2word_2)
        multi_corpora_data['data_dict_2'] = data_dict_2
    if 'PreparedDataObtained_collection_2' not in multi_corpora_data:
        temp_2 = prepare(**multi_corpora_data['data_dict_2'])
        multi_corpora_data['PreparedDataObtained_collection_2'] = temp_2.to_dict()
    if 'relevantDocumentsDict_collection_2' not in multi_corpora_data:
        multi_corpora_data['relevantDocumentsDict_collection_2'] = \
            matrix_documents_topic_contribution_2.to_dict(orient='records')

    # Get the most relevant keywords per topic, sorted by relevance.
    # When merging topics, these lists should be updated.
    tinfo_collection_1 = pd.DataFrame.from_dict(
        multi_corpora_data['PreparedDataObtained_collection_1']['tinfo'])
    tinfo_collection_1['relevance'] = (
        relevance_lambda * tinfo_collection_1['logprob'] +
        (1.00 - relevance_lambda) * tinfo_collection_1['loglift'])

    tinfo_collection_2 = pd.DataFrame.from_dict(
        multi_corpora_data['PreparedDataObtained_collection_2']['tinfo'])
    tinfo_collection_2['relevance'] = (
        relevance_lambda * tinfo_collection_2['logprob'] +
        (1.00 - relevance_lambda) * tinfo_collection_2['loglift'])

    # Build the top-keyword and top-relevant-document vectors per collection.
    topkeywords_vectors_dict_1, relevantdocuments_vectors_dict_1 = \
        get_topkeywords_relevantdocuments_vectors(
            word_embedding_model, lda_model_1,
            pd.DataFrame(multi_corpora_data['relevantDocumentsDict_collection_1']),
            topn_terms, tinfo_collection_1, topk_documents)
    topkeywords_vectors_dict_2, relevantdocuments_vectors_dict_2 = \
        get_topkeywords_relevantdocuments_vectors(
            word_embedding_model, lda_model_2,
            pd.DataFrame(multi_corpora_data['relevantDocumentsDict_collection_2']),
            topn_terms, tinfo_collection_2, topk_documents)

    # Save the intermediate results for later reuse.
    single_corpus_data['tinfo_collection_1'] = tinfo_collection_1
    single_corpus_data['tinfo_collection_2'] = tinfo_collection_2
    single_corpus_data['topkeywords_vectors_dict_1'] = topkeywords_vectors_dict_1
    single_corpus_data['topkeywords_vectors_dict_2'] = topkeywords_vectors_dict_2
    single_corpus_data['relevantdocuments_vectors_dict_1'] = \
        relevantdocuments_vectors_dict_1
    single_corpus_data['relevantdocuments_vectors_dict_2'] = \
        relevantdocuments_vectors_dict_2

    return get_dict_topic_similarity_matrix(
        word_embedding_model, lda_model_1,
        matrix_documents_topic_contribution_1, lda_model_2,
        matrix_documents_topic_contribution_2, topn_terms,
        multi_corpora_data['PreparedDataObtained_collection_1'],
        multi_corpora_data['PreparedDataObtained_collection_2'],
        topk_documents, relevance_lambda, tinfo_collection_1,
        tinfo_collection_2, topkeywords_vectors_dict_1,
        topkeywords_vectors_dict_2, relevantdocuments_vectors_dict_1,
        relevantdocuments_vectors_dict_2)
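# Example call from a request handler (hedged: `topic_modeler` and the model,
# corpus, and contribution-DataFrame names are placeholders; the call must run
# inside a Flask request context so HTTP_X_REAL_IP is available):
#
#     similarity = topic_modeler.calculate_topic_similarity_on_multi_corpora(
#         word_embedding_model, lda_model_1, lda_model_2, corpus_1, corpus_2,
#         id2word_1, id2word_2, matrix_documents_topic_contribution_1,
#         matrix_documents_topic_contribution_2, topn_terms=20,
#         topk_documents=10, relevance_lambda=0.6)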