def import_dict_and_normalize(name_database, name_collection, n_documents):
    print("Getting body subtitles from the database started ...")
    dbAdapter = dBAdapter.Database(name_database, name_collection)
    dbAdapter.open()
    listado = dbAdapter.selectGenerator_normalize_limit(n_documents)
    dic_subtitles = dbAdapter.selectDic_subtitles_limit(n_documents)
    dbAdapter.close()
    print("query finished")
    # Work on a real copy so that popping entries does not shift the key list
    # we index into below.
    dic_subtitles2 = dict(dic_subtitles)
    generator_normalize = []
    for i in range(len(listado)):
        try:
            generator_normalize.append(listado[i].split(","))
        except AttributeError:
            # The stored value is None (no normalized text): drop that subtitle.
            dic_subtitles2.pop(list(dic_subtitles.keys())[i])
            print("generator NoneType------>" + str(i))
    dic_subtitles = dic_subtitles2
    # Remove the empty tokens left over by the comma split.
    for gn in generator_normalize:
        while True:
            try:
                gn.remove("")
            except ValueError:
                break
    print("Getting body subtitles from the database finished ...")
    n_documents = len(generator_normalize)
    return dic_subtitles, generator_normalize, n_documents
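# Example (hedged sketch, not part of the original module): how this helper can
# be called from a script. The database/collection names match those used in the
# other scripts of this project; the document count is an arbitrary value.
if __name__ == "__main__":
    dic_subtitles, generator_normalize, n_documents = import_dict_and_normalize(
        'tfg_project', 'tv_storage', 100)
    print(str(n_documents) + " documents loaded; first subtitle: "
          + list(dic_subtitles.keys())[0])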
def update_doc2vec():
    # ------------------------------------------------------
    # UPDATE DDBB DOC2VEC
    # ------------------------------------------------------
    [files, max_documents] = g.get_NameFiles()
    [dic_subtitles, data] = c.create_d2v_corpus(max_documents)
    subtitles = list(dic_subtitles.keys())
    data_s = []
    for d in data:
        data_s.append(','.join(d))
    print("updating the database")
    dbAdapter = dBAdapter.Database('tfg_project', 'tv_storage')
    dbAdapter.open()
    for i in range(len(data_s)):
        dbAdapter.update_doc2vec(subtitles[i], data_s[i])
    dbAdapter.close()
def import_doc2vec_list(name_database, name_collection, n_documents):
    dbAdapter = dBAdapter.Database(name_database, name_collection)
    dbAdapter.open()
    print("Getting doc2vec list started...")
    list_s = dbAdapter.select_dataDoc2Vec(n_documents)
    print("Getting doc2vec list finished...")
    dbAdapter.close()
    # Split the stored comma-joined strings back into token lists, dropping
    # empty and whitespace-only tokens (removing "" and " " in the same
    # try block could stop early and leave blanks behind).
    data = []
    for l in list_s:
        data.append([token for token in l.split(",") if token.strip()])
    return data, n_documents
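# Example (hedged sketch): quick check of the tokenised doc2vec rows returned by
# import_doc2vec_list. The 50-document limit is only an illustrative value.
if __name__ == "__main__":
    data, n_documents = import_doc2vec_list('tfg_project', 'tv_storage', 50)
    print("loaded " + str(len(data)) + " tokenised documents; first tokens: "
          + str(data[0][:10]))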
def doc2vec_module(database, collection, n_documents=300, vector_size=50, max_clusters=200):
    # logs
    file_logs = config['LOGS']['doc2vec_logs']
    name_log_file = datetime.now().strftime(file_logs + '_%d_%m_%Y.log')
    logging.basicConfig(
        filename=name_log_file,
        level=logging.WARNING,
        format="%(asctime)s:%(filename)s:%(lineno)d:%(levelname)s:%(message)s")
    # end config variables-----------------------------------------------------

    # import from DDBB dic_subtitles and data doc2vec --------------------------
    print("Getting body subtitles from the database started ...")
    dbAdapter = dBAdapter.Database(database, collection)
    dbAdapter.open()
    dic_subtitles = dbAdapter.selectDic_subtitles_limit(n_documents)
    subtitles = list(dic_subtitles.keys())
    list_s = dbAdapter.select_dataDoc2Vec(n_documents)
    dbAdapter.close()
    print("Getting body subtitles from the database finished ...")
    # Split the stored comma-joined strings back into token lists, dropping
    # empty and whitespace-only tokens.
    data = []
    for l in list_s:
        data.append([token for token in l.split(",") if token.strip()])
    # --------------------------------------------------------------------------

    # Create the tagged documents needed for Doc2Vec
    def create_tagged_document(list_of_list_of_words):
        for i, list_of_words in enumerate(list_of_list_of_words):
            yield gensim.models.doc2vec.TaggedDocument(list_of_words, [subtitles[i]])

    train_data = list(create_tagged_document(data))

    print("starting with doc2vec....")
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size, min_count=2, epochs=40)
    # Build the vocabulary
    model.build_vocab(train_data)
    # Train the Doc2Vec model
    model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

    list_vec_doc2vec = [model.docvecs[subtitle] for subtitle in subtitles]
    arr_vec_doc2vec = np.stack(list_vec_doc2vec, axis=0)

    return list_vec_doc2vec, arr_vec_doc2vec, train_data, model
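# Example (hedged sketch): once doc2vec_module has run, document vectors can be
# compared through the same gensim 3.x docvecs API the function already uses.
# The tags are the subtitle names passed to TaggedDocument above; the document
# count is an arbitrary value.
if __name__ == "__main__":
    _, arr_vec, train_data, model = doc2vec_module('tfg_project', 'tv_storage',
                                                   n_documents=100, vector_size=50)
    first_tag = train_data[0].tags[0]
    # Ten documents most similar to the first one, as (tag, cosine similarity) pairs.
    print(model.docvecs.most_similar(first_tag, topn=10))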
@author: cvicentm """ """IN THIS PROGRAM THE CODE DOC2VEC WILL BE EXECUTED """ from modules.doc2vec import doc2vec as d2v from modules.classificator import k_means_doc2vec as k from modules.sql import dBAdapter import matplotlib.pyplot as plt import timeit import numpy as np import pickle database = 'tfg_project' collection = 'tv_storage' dbAdapter = dBAdapter.Database(database, collection) dbAdapter.open() max_documents = dbAdapter.get_maxDocuments() dbAdapter.close() max_clusters = 20 n_documents = 200 [list_vec_doc2vec, arr_vec_doc2vec, train_data, model] = d2v.doc2vec_module(database, collection, n_documents=n_documents, vector_size=50, max_clusters=max_clusters) #Para saber las palabras más parecidas con el modelo DM
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 11:44:45 2020

@author: cvicentm
"""
# ------------------------------------------------------
# GET DDBB: update channels into database
# ------------------------------------------------------
from modules.sql import dBAdapter

dbAdapter = dBAdapter.Database()
dbAdapter.open()
dic_subtitles = dict(dbAdapter.selectAll())
dbAdapter.close()
print("query finished")

import modules.variables as v

channels = v.CHANNELS

# Pair each subtitle with the channel whose name appears in the subtitle key.
channel_column = [(subtitle, channel)
                  for subtitle in list(dic_subtitles.keys())
                  for channel in channels
                  if subtitle.find(channel) != -1]

dbAdapter = dBAdapter.Database()
dbAdapter.open()
for ch in channel_column:
    dbAdapter.update_channel(ch[0], ch[1])
dbAdapter.close()
print("query finished")
"""
Created on Wed Jul  8 18:25:36 2020

@author: cvicentm
"""
# pip install pymongo
from pymongo import MongoClient
from modules.sql import dBAdapter
import pandas as pd
import json
from tqdm import tqdm
import logging

dbAdapter = dBAdapter.Database('tfg_project', 'tv_storage')
dbAdapter.open()
result = list(dbAdapter.selectDict())
result2 = dbAdapter.get_maxDocuments()
result3 = list(dbAdapter.selectRowByName('antena3_2019 09 14_morning_new'))
result4 = list(dbAdapter.selectDic_subtitles_limit(10))
result5 = list(dbAdapter.select_dataDoc2Vec(40))
dbAdapter.update_doc2vec("1_spa_2019 07 21_morning_new", 'hola')
dbAdapter.close()


def mongo_q_doc2vec_to_list(mongo_q_doc2vec):
    # Flatten the Mongo query result into a plain list of doc2vec strings.
    result = []
    for mq in mongo_q_doc2vec:
        result.append(mq['doc2vec'])
    return result


list5 = mongo_q_doc2vec_to_list(result5)
def max_documents(name_database, name_collection):
    dbAdapter = dBAdapter.Database(name_database, name_collection)
    dbAdapter.open()
    max_documents = dbAdapter.get_maxDocuments()
    dbAdapter.close()
    return max_documents
def LDAmodel(n_topics, n_documents, n_printedDocuments, name_database, name_collection, step=1, start=1):
    # TODO: document what each gensim step is for

    # import from DDBB dic_subtitles and generator normalize -------------------
    """
    print("Getting body subtitles from the database started ...")
    dbAdapter = dBAdapter.Database()
    dbAdapter.open()
    dic_subtitles = dict(dbAdapter.selectDic_subtitles_limit(n_documents))
    gn = dbAdapter.selectGenerator_normalize_limit(n_documents)
    generator_normalize = [ast.literal_eval(gni[0]) for gni in gn]
    dbAdapter.close()
    print("Getting body subtitles from the database finished ...")
    """
    print("Getting body subtitles from the database started ...")
    dbAdapter = dBAdapter.Database(name_database, name_collection)
    dbAdapter.open()
    listado = dbAdapter.selectGenerator_normalize_limit(n_documents)
    dic_subtitles = dbAdapter.selectDic_subtitles_limit(n_documents)
    dbAdapter.close()
    print("query finished")
    # Work on a real copy so that popping entries does not shift the key list
    # we index into below.
    dic_subtitles2 = dict(dic_subtitles)
    generator_normalize = []
    for i in range(len(listado)):
        try:
            generator_normalize.append(listado[i].split(","))
        except AttributeError:
            # The stored value is None (no normalized text): drop that subtitle.
            dic_subtitles2.pop(list(dic_subtitles.keys())[i])
            print("generator NoneType------>" + str(i))
    dic_subtitles = dic_subtitles2
    for gn in generator_normalize:
        while True:
            try:
                gn.remove("")
            except ValueError:
                break
    print("Getting body subtitles from the database finished ...")
    n_documents = len(generator_normalize)
    # --------------------------------------------------------------------------

    coherencemodelArray = []

    if not os.path.exists('D:\\caleb\\pickle\\' + str(n_documents)):
        os.makedirs('D:\\caleb\\pickle\\' + str(n_documents))

    try:
        id2word = pickle.load(
            open('D:\\caleb\\pickle\\' + str(n_documents) + '\\id2word_'
                 + str(n_documents) + '.txt', 'rb'))
        corpus = pickle.load(
            open('D:\\caleb\\pickle\\' + str(n_documents) + '\\corpus_'
                 + str(n_documents) + '.txt', 'rb'))
        print("generator_normalize, id2word and corpus have been imported")
    except IOError:
        print("Process of creating corpus and the dictionary has started")
        # This creates a dictionary with all the different words of the documents
        id2word = corpora.Dictionary(generator_normalize)
        file_id2word = 'D:\\caleb\\pickle\\' + str(n_documents) + '\\id2word_' + str(n_documents) + '.txt'
        pickle.dump(id2word, open(file_id2word, 'wb'))
        # Create Corpus: Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in generator_normalize]
        file_corpus = 'D:\\caleb\\pickle\\' + str(n_documents) + '\\corpus_' + str(n_documents) + '.txt'
        pickle.dump(corpus, open(file_corpus, 'wb'))
        print("Process of creating corpus and the dictionary has ended")

    for n_topics in chain(range(1, 2), range(2, 18, 2), range(18, 200, 8)):
        file_lda_model = ('D:\\caleb\\pickle\\' + str(n_documents) + '\\lda_model_'
                          + str(n_topics) + '_' + str(n_documents) + '.sav')
        try:
            f = open(file_lda_model, 'rb')
            lda = pickle.load(f)
            print("The model has been trained previously with... " + str(n_topics) + " n_topics")
            coherencemodel = CoherenceModel(model=lda, corpus=corpus,
                                            dictionary=id2word, coherence='u_mass')
            coherencemodel_cv = CoherenceModel(model=lda, texts=list(generator_normalize),
                                               dictionary=id2word, coherence='c_v')
            coherencemodel_c_uci = CoherenceModel(model=lda, texts=list(generator_normalize),
                                                  dictionary=id2word, coherence='c_uci')
            file_coherence_cv = ('D:\\caleb\\pickle\\' + str(n_documents) + '\\cv_'
                                 + str(n_topics) + '_' + str(n_documents) + '.sav')
            pickle.dump(coherencemodel_cv, open(file_coherence_cv, 'wb'))
            file_coherence_c_uci = ('D:\\caleb\\pickle\\' + str(n_documents) + '\\c_uci_'
                                    + str(n_topics) + '_' + str(n_documents) + '.sav')
            pickle.dump(coherencemodel_c_uci, open(file_coherence_c_uci, 'wb'))
            # CoherenceModel(model=goodLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')
            # coherencemodel = CoherenceModel(model=lda, texts=list(generator_normalize), dictionary=id2word, coherence='c_v')
            coherence_values = coherencemodel.get_coherence()
            coherencemodelArray.append(coherence_values)
        except IOError:
            print("FINALLY: the LDA model has to be trained for " + str(n_documents)
                  + " n_documents and " + str(n_topics) + " n_topics")
            tic_all_processing = timeit.default_timer()
            # function based on: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#13viewthetopicsinldamodel
            [coherence_values, coherencemodel_cv, coherencemodel_c_uci] = training_model(
                n_documents, n_topics, id2word, corpus, generator_normalize)
            coherencemodelArray.append(coherence_values)
            toc_all_processing = timeit.default_timer()
            try:
                time_lda_fit = str(datetime.timedelta(
                    seconds=int(float(toc_all_processing - tic_all_processing))))
                print("The process of training the lda model with " + str(n_topics)
                      + " n_topics and " + str(n_documents) + " n_documents has taken "
                      + time_lda_fit + " seconds")
            except AttributeError:
                print("The process of training the lda model with " + str(n_topics)
                      + " n_topics and " + str(n_documents) + " n_documents has ended")
            file_coherence_cv = ('D:\\caleb\\pickle\\' + str(n_documents) + '\\cv_'
                                 + str(n_topics) + '_' + str(n_documents) + '.sav')
            pickle.dump(coherencemodel_cv, open(file_coherence_cv, 'wb'))
            file_coherence_c_uci = ('D:\\caleb\\pickle\\' + str(n_documents) + '\\c_uci_'
                                    + str(n_topics) + '_' + str(n_documents) + '.sav')
            pickle.dump(coherencemodel_c_uci, open(file_coherence_c_uci, 'wb'))

    coherencemodelArray = list(coherencemodelArray)
    file_coherence_umass = 'D:\\caleb\\pickle\\coherencemodelarray.sav'
    pickle.dump(coherencemodelArray, open(file_coherence_umass, 'wb'))

    # x must have the same length as coherencemodelArray
    x = list(chain(range(1, 2), range(2, 18, 2), range(18, 200, 8)))
    score = savgol_filter(coherencemodelArray, 11, 3)
    plt.plot(x, score)
    plt.xlabel("N_Topics")
    plt.ylabel("Coherence")
    plt.legend(["coherence_values"], loc='best')
    plt.show()

    # Map the index of the best coherence score back to its topic count on the
    # sampled grid (the grid is not contiguous, so index + start would not match).
    best_n_topic = x[coherencemodelArray.index(min(coherencemodelArray))]
    print("the best model is: " + 'D:\\caleb\\pickle\\' + str(n_documents)
          + '\\lda_model_' + str(best_n_topic) + '_' + str(n_documents) + '.sav')

    f = open('D:\\caleb\\pickle\\' + str(n_documents) + '\\lda_model_'
             + str(best_n_topic) + '_' + str(n_documents) + '.sav', 'rb')
    lda = pickle.load(f)
    document_per_topic = list(lda.get_document_topics(corpus))
    """
    corp_cur = corpus[1]
    topic_percs, wordid_topics, wordid_phivalues = lda[corp_cur]
    print(wordid_topics)
    """
    array_topic_per_document = np.zeros((len(document_per_topic), best_n_topic))
    for i in range(len(document_per_topic)):
        for j in range(len(document_per_topic[i])):
            try:
                array_topic_per_document[i][document_per_topic[i][j][0]] = document_per_topic[i][j][1]
            except IndexError:
                # TODO: this log should also record which subtitle caused the
                # problem and identify why.
                logging.warning("array_topic_per_document out of range in position n_document: "
                                + str(i) + " and topic: " + str(j) + " \n")

    # NUMBER OF DOCUMENTS to print results on Word
    return array_topic_per_document, best_n_topic, dic_subtitles, lda, generator_normalize, corpus, id2word, coherencemodelArray
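# Hedged sketch (not part of the original module): the lda model returned above
# can be summarised with the standard gensim LdaModel API, e.g. printing the top
# words of each of the best_n_topic topics.
def print_topics_summary(lda, best_n_topic, n_words=10):
    for topic_id, words in lda.print_topics(num_topics=best_n_topic, num_words=n_words):
        print("topic " + str(topic_id) + ": " + words)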
# topic summary ---------------------------------------------------------------
"""IN THIS CODE WE WILL EXECUTE THE CODE RELATED TO LDA"""

start_topics = 1
N_TOPICS = 2
step = 2  # this parameter cannot be set by hand
n_printedDocuments = 20
max_clusters = 200

from modules.sql import dBAdapter

name_database = 'tfg_project'
name_collection = 'tv_storage'

dbAdapter = dBAdapter.Database(name_database, name_collection)
dbAdapter.open()
max_documents = dbAdapter.get_maxDocuments()
dbAdapter.close()

# if we want to change the number of documents to analyze, we can do it here
n_documents = max_documents

# PROGRAM ----------------------------------------------------------------------
[
    array_topic_per_document, best_n_topic, dic_subtitles, lda,
    generator_normalize, corpus, id2word, coherencemodelArray
] = LDAmodel(N_TOPICS, n_documents, n_printedDocuments,