def prepare_multi_corpora(self, lda_model_1, lda_model_2, corpus_1,
                              corpus_2, id2word_1, id2word_2,
                              matrix_documents_topic_contribution_1,
                              matrix_documents_topic_contribution_2,
                              topic_similarity_matrix):
        """Populate the per-client multi-corpora cache with prepared LDA data.

        Mirrors ``prepare_single_corpus``: results are stored in the
        module-level ``multi_corpora_datasets`` dict keyed by the client's
        IP, so repeated calls reuse the expensive ``prepare`` results
        instead of recomputing them.

        BUG FIX (review): the original built a fresh local dict that was
        never returned or stored, so the method had no observable effect;
        it now writes into ``multi_corpora_datasets[ip]`` like every
        sibling method. Also fixed trailing commas that stored the LDA
        models as 1-tuples instead of the models themselves.
        """
        global multi_corpora_datasets
        ip = request.environ.get("HTTP_X_REAL_IP")
        multi_corpora_data = multi_corpora_datasets[ip]

        # --- collection 1: compute-once cache of prepared data -----------
        if 'data_dict_1' not in multi_corpora_data:
            data_dict_1 = gensim_helpers.prepare(lda_model_1, corpus_1,
                                                 id2word_1)
            multi_corpora_data['data_dict_1'] = data_dict_1
        if 'PreparedDataObtained_collection_1' not in multi_corpora_data:
            temp_1 = prepare(**multi_corpora_data['data_dict_1'])
            multi_corpora_data[
                'PreparedDataObtained_collection_1'] = temp_1.to_dict()

        if 'relevantDocumentsDict_collection_1' not in multi_corpora_data:
            relevantDocumentsDict_collection_1 = matrix_documents_topic_contribution_1.to_dict(
                orient='records')
            multi_corpora_data[
                'relevantDocumentsDict_collection_1'] = relevantDocumentsDict_collection_1

        # --- collection 2: same caching pattern --------------------------
        if 'data_dict_2' not in multi_corpora_data:
            data_dict_2 = gensim_helpers.prepare(lda_model_2, corpus_2,
                                                 id2word_2)
            multi_corpora_data['data_dict_2'] = data_dict_2
        if 'PreparedDataObtained_collection_2' not in multi_corpora_data:
            temp_2 = prepare(**multi_corpora_data['data_dict_2'])
            multi_corpora_data[
                'PreparedDataObtained_collection_2'] = temp_2.to_dict()

        if 'relevantDocumentsDict_collection_2' not in multi_corpora_data:
            relevantDocumentsDict_collection_2 = matrix_documents_topic_contribution_2.to_dict(
                orient='records')
            multi_corpora_data[
                'relevantDocumentsDict_collection_2'] = relevantDocumentsDict_collection_2

        # Store the raw inputs alongside the derived data.
        # (No trailing commas: the originals accidentally stored 1-tuples.)
        multi_corpora_data['lda_model_1'] = lda_model_1
        multi_corpora_data['lda_model_2'] = lda_model_2

        multi_corpora_data['corpus_1'] = corpus_1
        multi_corpora_data['corpus_2'] = corpus_2

        multi_corpora_data['id2word_1'] = id2word_1
        multi_corpora_data['id2word_2'] = id2word_2

        multi_corpora_data['topic_similarity_matrix'] = topic_similarity_matrix

        multi_corpora_data['topic_order_collection_1'] = multi_corpora_data[
            'PreparedDataObtained_collection_1']['topic.order']
        multi_corpora_data['topic_order_collection_2'] = multi_corpora_data[
            'PreparedDataObtained_collection_2']['topic.order']
    def prepare_single_corpus(self, lda_model, corpus, id2word,
                              matrix_documents_topic_contribution,
                              topic_similarity_matrix):
        """Fill the per-client single-corpus cache with prepared LDA data.

        Expensive ``prepare`` results are computed at most once per client
        (keyed by IP in the module-level ``single_corpus_datasets``); the
        raw inputs and derived circle positions are (re)stored on every
        call.
        """
        global single_corpus_datasets
        client_ip = request.environ.get("HTTP_X_REAL_IP")
        cache = single_corpus_datasets[client_ip]

        # Compute-once entries: skip any that are already cached.
        if 'data_dict' not in cache:
            cache['data_dict'] = gensim_helpers.prepare(lda_model, corpus,
                                                        id2word)
        if 'PreparedDataObtained' not in cache:
            print("A NEW PREPARED DATA HA SIDO CREADO!!")
            prepared = prepare(**cache['data_dict'])
            cache['PreparedDataObtained'] = prepared.to_dict()
        if 'relevantDocumentsDict' not in cache:
            cache['relevantDocumentsDict'] = (
                matrix_documents_topic_contribution.to_dict(orient='records'))

        circle_positions = get_circle_positions(topic_similarity_matrix)

        # Always refresh the raw inputs and derived values.
        cache['lda_model'] = lda_model
        cache['corpus'] = corpus
        cache['id2word'] = id2word
        cache['topic_similarity_matrix'] = topic_similarity_matrix
        cache['topic.order'] = cache['PreparedDataObtained']['topic.order']
        cache['new_circle_positions'] = circle_positions
    def calculate_topic_similarity_on_single_corpus(
            self, word_embedding_model, lda_model, corpus, id2word,
            matrix_documents_topic_contribution, topn_terms, topk_documents,
            relevance_lambda):
        """Compute the topic-similarity matrix of a corpus against itself.

        Caches prepared LDA data per client IP, derives per-term relevance
        (lambda-weighted mix of logprob and loglift), builds keyword /
        relevant-document vectors, and returns the result of
        ``get_dict_topic_similarity_matrix`` with the same collection
        passed on both sides.
        """
        global single_corpus_datasets
        ip = request.environ.get("HTTP_X_REAL_IP")
        single_corpus_data = single_corpus_datasets[ip]
        print("we are calculating a new topic similarity matirx")
        if 'data_dict' not in single_corpus_data:
            data_dict = gensim_helpers.prepare(lda_model, corpus, id2word)
            single_corpus_data['data_dict'] = data_dict
        if 'PreparedDataObtained' not in single_corpus_data:
            # BUG FIX: read the cached dict instead of the local name
            # `data_dict`, which is unbound when 'data_dict' was already
            # cached (NameError in the original).
            temp = prepare(**single_corpus_data['data_dict'])
            single_corpus_data['PreparedDataObtained'] = temp.to_dict()

        if 'relevantDocumentsDict' not in single_corpus_data:
            relevantDocumentsDict = matrix_documents_topic_contribution.to_dict(
                orient='records')
            single_corpus_data['relevantDocumentsDict'] = relevantDocumentsDict

        #update word embedding model
        single_corpus_data['word_embedding_model'] = word_embedding_model

        # Most relevant keywords sorted by relevance:
        # relevance = lambda * logprob + (1 - lambda) * loglift
        #in merging, we should update this list
        tinfo_collection_1 = pd.DataFrame.from_dict(
            single_corpus_data['PreparedDataObtained']['tinfo'])
        tinfo_collection_1[
            'relevance'] = relevance_lambda * tinfo_collection_1['logprob'] + (
                1.00 - relevance_lambda) * tinfo_collection_1['loglift']

        # Top-keyword and top-relevant-document vectors for this corpus.
        topkeywords_vectors_dict_1, relevantdocuments_vectors_dict_1 = get_topkeywords_relevantdocuments_vectors(
            word_embedding_model, lda_model,
            pd.DataFrame(single_corpus_data['relevantDocumentsDict']),
            topn_terms, tinfo_collection_1, topk_documents)

        # Persist the derived data for later requests by the same client.
        single_corpus_data['tinfo_collection'] = tinfo_collection_1
        single_corpus_data[
            'topkeywords_vectors_dict'] = topkeywords_vectors_dict_1
        single_corpus_data[
            'relevantdocuments_vectors_dict'] = relevantdocuments_vectors_dict_1

        # Single-corpus case: the same model/data is used for both sides.
        return get_dict_topic_similarity_matrix(
            word_embedding_model, lda_model,
            matrix_documents_topic_contribution, lda_model,
            matrix_documents_topic_contribution, topn_terms,
            single_corpus_data['PreparedDataObtained'],
            single_corpus_data['PreparedDataObtained'], topk_documents,
            relevance_lambda, tinfo_collection_1, tinfo_collection_1,
            topkeywords_vectors_dict_1, topkeywords_vectors_dict_1,
            relevantdocuments_vectors_dict_1, relevantdocuments_vectors_dict_1)
    def calculate_topic_similarity_on_multi_corpora_metric_baseline(
            self,
            word_embedding_model,
            lda_model_1,
            lda_model_2,
            corpus_1,
            corpus_2,
            id2word_1,
            id2word_2,
            relevance_lambda=0.6,
            topn_terms=20):
        """Baseline-metric topic similarity between two corpora.

        Builds the prepared visualization data for each (model, corpus,
        id2word) pair and delegates the metric computation to
        ``generar_matrix_baseline_metric``. Nothing is cached here.
        """
        # Prepared data for the first collection.
        prepared_data_topic_1 = prepare(
            **gensim_helpers.prepare(lda_model_1, corpus_1, id2word_1))
        # Prepared data for the second collection.
        prepared_data_topic_2 = prepare(
            **gensim_helpers.prepare(lda_model_2, corpus_2, id2word_2))

        return generar_matrix_baseline_metric(word_embedding_model,
                                              prepared_data_topic_1,
                                              prepared_data_topic_2,
                                              relevance_lambda, topn_terms)
    def calculate_topic_similarity_on_multi_corpora(
            self, word_embedding_model, lda_model_1, lda_model_2, corpus_1,
            corpus_2, id2word_1, id2word_2,
            matrix_documents_topic_contribution_1,
            matrix_documents_topic_contribution_2, topn_terms, topk_documents,
            relevance_lambda):
        """Compute the topic-similarity matrix between two corpora.

        Caches prepared LDA data for both collections per client IP,
        derives per-term relevance (lambda-weighted mix of logprob and
        loglift) for each, builds keyword / relevant-document vectors, and
        returns the result of ``get_dict_topic_similarity_matrix``.
        """
        global multi_corpora_datasets
        global single_corpus_datasets
        ip = request.environ.get("HTTP_X_REAL_IP")
        multi_corpora_data = multi_corpora_datasets[ip]
        single_corpus_data = single_corpus_datasets[ip]
        if 'data_dict_1' not in multi_corpora_data:
            data_dict_1 = gensim_helpers.prepare(lda_model_1, corpus_1,
                                                 id2word_1)
            multi_corpora_data['data_dict_1'] = data_dict_1
        if 'PreparedDataObtained_collection_1' not in multi_corpora_data:
            # BUG FIX: read the cached dict instead of the local name
            # `data_dict_1`, which is unbound when 'data_dict_1' was
            # already cached (NameError in the original).
            temp_1 = prepare(**multi_corpora_data['data_dict_1'])
            multi_corpora_data[
                'PreparedDataObtained_collection_1'] = temp_1.to_dict()

        if 'relevantDocumentsDict_collection_1' not in multi_corpora_data:
            relevantDocumentsDict_collection_1 = matrix_documents_topic_contribution_1.to_dict(
                orient='records')
            multi_corpora_data[
                'relevantDocumentsDict_collection_1'] = relevantDocumentsDict_collection_1

        if 'data_dict_2' not in multi_corpora_data:
            data_dict_2 = gensim_helpers.prepare(lda_model_2, corpus_2,
                                                 id2word_2)
            multi_corpora_data['data_dict_2'] = data_dict_2
        if 'PreparedDataObtained_collection_2' not in multi_corpora_data:
            # BUG FIX: same unbound-local fix as for collection 1.
            temp_2 = prepare(**multi_corpora_data['data_dict_2'])
            multi_corpora_data[
                'PreparedDataObtained_collection_2'] = temp_2.to_dict()

        if 'relevantDocumentsDict_collection_2' not in multi_corpora_data:
            relevantDocumentsDict_collection_2 = matrix_documents_topic_contribution_2.to_dict(
                orient='records')
            multi_corpora_data[
                'relevantDocumentsDict_collection_2'] = relevantDocumentsDict_collection_2

        # Most relevant keywords sorted by relevance:
        # relevance = lambda * logprob + (1 - lambda) * loglift
        #in merging, we should update this list
        tinfo_collection_1 = pd.DataFrame.from_dict(
            multi_corpora_data['PreparedDataObtained_collection_1']['tinfo'])
        tinfo_collection_1[
            'relevance'] = relevance_lambda * tinfo_collection_1['logprob'] + (
                1.00 - relevance_lambda) * tinfo_collection_1['loglift']

        tinfo_collection_2 = pd.DataFrame.from_dict(
            multi_corpora_data['PreparedDataObtained_collection_2']['tinfo'])
        tinfo_collection_2[
            'relevance'] = relevance_lambda * tinfo_collection_2['logprob'] + (
                1.00 - relevance_lambda) * tinfo_collection_2['loglift']

        # Top-keyword and top-relevant-document vectors per collection.
        topkeywords_vectors_dict_1, relevantdocuments_vectors_dict_1 = get_topkeywords_relevantdocuments_vectors(
            word_embedding_model, lda_model_1,
            pd.DataFrame(
                multi_corpora_data['relevantDocumentsDict_collection_1']),
            topn_terms, tinfo_collection_1, topk_documents)
        topkeywords_vectors_dict_2, relevantdocuments_vectors_dict_2 = get_topkeywords_relevantdocuments_vectors(
            word_embedding_model, lda_model_2,
            pd.DataFrame(
                multi_corpora_data['relevantDocumentsDict_collection_2']),
            topn_terms, tinfo_collection_2, topk_documents)

        # NOTE(review): these derived values are saved into the
        # *single*-corpus cache even though they belong to the multi-corpora
        # flow — preserved as-is since downstream readers may depend on it,
        # but confirm whether `multi_corpora_data` was intended.
        single_corpus_data['tinfo_collection_1'] = tinfo_collection_1
        single_corpus_data['tinfo_collection_2'] = tinfo_collection_2

        single_corpus_data[
            'topkeywords_vectors_dict_1'] = topkeywords_vectors_dict_1
        single_corpus_data[
            'topkeywords_vectors_dict_2'] = topkeywords_vectors_dict_2

        single_corpus_data[
            'relevantdocuments_vectors_dict_1'] = relevantdocuments_vectors_dict_1
        single_corpus_data[
            'relevantdocuments_vectors_dict_2'] = relevantdocuments_vectors_dict_2

        return get_dict_topic_similarity_matrix(
            word_embedding_model, lda_model_1,
            matrix_documents_topic_contribution_1, lda_model_2,
            matrix_documents_topic_contribution_2, topn_terms,
            multi_corpora_data['PreparedDataObtained_collection_1'],
            multi_corpora_data['PreparedDataObtained_collection_2'],
            topk_documents, relevance_lambda, tinfo_collection_1,
            tinfo_collection_2, topkeywords_vectors_dict_1,
            topkeywords_vectors_dict_2, relevantdocuments_vectors_dict_1,
            relevantdocuments_vectors_dict_2)