def main(): log.info("--------------------------DRY TOPICS------------------------------------") input_list = pd.read_csv(conf.get("MAIN", "path_document"), encoding='utf-8', error_bad_lines=False) input_list = input_list[(input_list.idriferimento_ricerca == 5) | (input_list.idriferimento_ricerca == 6)]['messaggio'].tolist() w2v_model = word2vec_ms.Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model')) som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model')) cluster_model = load_obj(conf.get('MAIN', 'path_pickle_codebook_cluster_model')) dried_topics = Topics.doSomAndDryTopics(input_list, w2v_model, som_model, cluster_model) Topics.predictTopics(input_list, w2v_model, som_model, cluster_model, dried_topics)
def doClusteringAndPlot(tweet_rows, file_name):
    log.info("clustering and plot")

    # clean input
    cleaned_tweet_list = clean_text_ms.cleanText(tweet_rows)

    # get embedded words from input
    embedded_words_tweets, dict_index2word_tweet, dict_word2index_tweet = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list)

    # load SOM and cluster model
    mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    cluster_model = load_obj(conf.get('MAIN', 'path_pickle_cluster_model'))
    log.info("SOM model loaded " + conf.get('MAIN', 'path_pickle_som_model'))

    # mySom.predict tells, for each word, which codebook (unit) it falls into
    data2unit, data2cell, data2dist, data2saliency, data2saliency_index, data2maps = mySom.predict(
        embedded_words_tweets)

    log.info("fit cluster...")
    # make clustering
    data2cluster = cluster_model.predict(embedded_words_tweets)

    # -------------------------OUTPUT print table of clusters------------------------------
    path = './data/output/cluster_output.txt'
    dict_cluster_topic = getTopic(data2cluster, dict_index2word_tweet)
    printTable(dict_cluster_topic, path)

    # -------------------------OUTPUT bubble-chart cluster-----------------------
    codebook2word, codebook2index = getCodebook2Word(data2unit, data2dist, dict_index2word_tweet)
    dict_cluster2codebook = getCluster2codebook(data2cluster, data2unit)
    cluster2most_repr_word_index = getCluster2mostRepresentativeWordIndex(dict_cluster2codebook,
                                                                          codebook2index.values())
    # dict cluster -> most representative words
    cluster2most_repr_words = getCluster2mostRepresentativeWords(cluster2most_repr_word_index,
                                                                 dict_index2word_tweet)
    # dict cluster -> mean vector of the most representative vectors
    cluster2mean_vector = getCluster2meanVector(cluster2most_repr_word_index, embedded_words_tweets)

    cell_frequency = mySom.cellFrequencyDistribution(embedded_words_tweets)

    # save_obj(data2cluster, "./data2cluster.pickle")
    # save_obj(codebook2word, "./codebook2word.pickle")
    # save_obj(dict_word2index_tweet, "./dict_word2index_tweet.pickle")
    # save_obj(cell_frequency, "./cell_frequency.pickle")

    url = buildClusterCsv(data2cluster, codebook2word, dict_word2index_tweet, cell_frequency, file_name)

    # build MST
    # url = plot_graph.plot_similarity_graph(numpy.array(cluster2mean_vector.values()),
    #                                        cluster2most_repr_words.values(), file_name, conf, "markers", type_chart)

    return url
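# Hedged usage sketch (not called anywhere in the pipeline): doClusteringAndPlot() takes
# the raw tweet texts plus the name of the cluster CSV to write, and returns the URL/path
# produced by buildClusterCsv(). The file name below is an invented example.
def _example_clustering_call(tweet_rows):
    return doClusteringAndPlot(tweet_rows, "./data/output/example_clusters.csv")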
def trainBestSom(w2v_model, new_model=False, identifier=""):
    # get w2v words, dict words and vectors only for tweets
    embedded_words_t_w, dict_index2word_t_w, dict_word2indext_w = collectWords(w2v_model)

    width = int(conf.get('ADVANCED_ASOM', 'width'))
    height = int(conf.get('ADVANCED_ASOM', 'height'))
    empty_codebook_threshold = conf.getint('ADVANCED_ASOM', 'empty_codebook_threshold')

    log.info("training som [" + str(width) + "x" + str(height) + "]")
    mySom = trainSOM(embedded_words_t_w, dict_index2word_t_w, conf, width, height)

    min_size_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'min_size_codebook_mtx'))
    step_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'step_codebook_mtx'))

    # decrease SOM dimensions while too many codebooks are empty and the map is still large enough
    while (not isGoodResult(mySom, width, height, empty_codebook_threshold)
           and width > min_size_codebook_mtx + step_codebook_mtx):
        width = height = height - 2
        log.info("training som [" + str(width) + "x" + str(height) + "]")
        mySom = trainSOM(embedded_words_t_w, dict_index2word_t_w, conf, width, height)

    if not new_model:
        save_obj(mySom, conf.get('MAIN', 'path_pickle_som_model'))
        log.info("Model trained")
        mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    else:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold') + "som_" + str(identifier) + ".pickle"
        save_obj(mySom, filename)
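# Illustrative sketch of the shrink schedule implemented by trainBestSom() above: with
# assumed ADVANCED_ASOM values (width = height = 30, min_size_codebook_mtx = 10,
# step_codebook_mtx = 2) and a SOM that keeps failing the empty-codebook check, the map
# would be retrained at 30x30, 28x28, ... down to 12x12. All numbers here are examples,
# not the project's real configuration.
def _example_shrink_schedule(width=30, min_size_codebook_mtx=10, step_codebook_mtx=2):
    sizes = [(width, width)]  # first attempt at the configured size
    while width > min_size_codebook_mtx + step_codebook_mtx:
        width -= 2            # each retry shrinks width and height by 2
        sizes.append((width, width))
    return sizes              # [(30, 30), (28, 28), ..., (12, 12)]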
def getUmatrix(som_model=None):
    # load SOM
    if som_model is None:
        som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))

    UM, unit_xy = som_model.evaluate_UMatrix()
    filename = conf.get('MAIN', 'umatrix_filename')
    plot_graph.plotMatrix(UM, filename)
    return filename
def getCodebookActivation(som_model=None):
    # load SOM
    if som_model is None:
        som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))

    som_model.plot_activations()
    filename = conf.get('MAIN', 'codebook_activation_filename')
    pylab.savefig(filename)
    return filename
def main(): log.info("---------------------------CLUSTER CODEBOOK------------------------------------") #-------------------------KMEANS -------------------------------------------------- log.info("START CLUSTERING") mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model')) som_ms.trainCodebookCluster(mySom, new_model=False)
def getCostOfSom(som_model=None):
    # load SOM
    if som_model is None:
        som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))

    # estimate the quantization cost on the first 10000 samples and format it as a percentage
    cost = som_model.estimate_cost2(som_model.X[0:10000])
    cost = round(cost * 100, 2)
    cost = str(cost) + " %"
    log.info("cost: " + cost)
    return cost
def __init__(self, tweets, w2v_model=None):
    # load the trained W2V model if one is not passed in
    if w2v_model is None:
        self.model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
    else:
        self.model = w2v_model

    self.vec2tweets = {}
    self.vec2word = {}
    self.word2tweet = {}
    self.tweets = tweets
    self.cleaned_tweets = clean_text_ms.cleanText(tweets)

    # reload previously computed mappings if they were pickled
    if os.path.exists(conf.get('MAIN', 'path_vec2tweets')):
        self.vec2tweets = load_obj(conf.get('MAIN', 'path_vec2tweets'))
    if os.path.exists(conf.get('MAIN', 'path_vec2word')):
        self.vec2word = load_obj(conf.get('MAIN', 'path_vec2word'))
    if os.path.exists(conf.get('MAIN', 'path_word2tweet')):
        self.word2tweet = load_obj(conf.get('MAIN', 'path_word2tweet'))

    self.embedded_words, self.index2word, self.word2index = getEmbeddedWords(
        self.cleaned_tweets, w2v_model)
def __init__(self, input_list, w2v_model=None, som_model=None):
    # load the trained SOM if one is not passed in
    if som_model is None:
        self.som = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    else:
        self.som = som_model

    self.data2unit = []
    self.data2cell = []
    self.data2dist = []
    self.data2saliency = []
    self.data2saliency_index = []
    self.data2maps = []
    self.codebook2indexes = {}

    if w2v_model is None:
        self.word2vecMS = Word2VecMS(input_list)
    else:
        self.word2vecMS = Word2VecMS(input_list, w2v_model)
def main():
    log.info("----------------------------------START------------------------------------")
    reload(sys)
    sys.setdefaultencoding('utf-8')

    document_path_file = conf.get('MAIN', 'path_document')
    log.info("reading input file: " + document_path_file)

    # ------------------------READ INPUT-------------------------------------------------------
    # read csv into list of strings
    input_list = pd.read_csv(document_path_file, encoding='utf-8', error_bad_lines=False)
    # mark which rows come from Twitter (source ids 5 and 6)
    source_value = np.array(input_list['idriferimento_ricerca'])
    tweet_rows_bool = (source_value == 5) | (source_value == 6)
    # keep all input texts
    input_list = input_list['messaggio'].tolist()

    # ------------------------CLEANING TEXT---------------------------------------------------
    cleaned_input_list = []
    path_csv_output_folder = conf.get('MAIN', 'path_csv_output_folder')
    file = path_csv_output_folder + 'cleaned_tweet_list.csv'
    if os.path.isfile(file):
        log.info("reading input from file " + file)
        cleaned_input_list = pd.read_csv(file, encoding='utf-8', error_bad_lines=False)
        cleaned_input_list = cleaned_input_list['colummn'].tolist()
    if cleaned_input_list == [] or cleaned_input_list == [[]]:
        log.info("CLEANING TEXT")
        cleaned_input_list = clean_text_ms.cleanText(input_list)
        # write output to csv
        df = pd.DataFrame(cleaned_input_list, columns=["colummn"])
        df.to_csv(file, index=False)
        log.info("file saved in " + file)

    # train the w2v model only if it does not exist yet
    if not os.path.exists(conf.get('MAIN', 'path_pickle_w2v_model')):
        # -------------------------GET ENTITIES----------------------------------------------------
        log.info("GET ENTITIES")
        entity_list = []
        file_entity_list = path_csv_output_folder + 'entity_list.csv'
        file = file_entity_list
        if os.path.isfile(file):
            log.info("reading input from file " + file)
            entity_list = pd.read_csv(file, encoding='utf-8', error_bad_lines=False)
            entity_list = entity_list['colummn'].tolist()

        tweet_with_entity_list = []
        file_tweet_with_entity_list = path_csv_output_folder + 'tweet_with_entity_list.csv'
        file = file_tweet_with_entity_list
        if os.path.isfile(file):
            log.info("reading input from file " + file)
            tweet_with_entity_list = pd.read_csv(file, encoding='utf-8', error_bad_lines=False)
            tweet_with_entity_list = tweet_with_entity_list['colummn'].tolist()

        all_uri = []
        file_all_uri = path_csv_output_folder + 'all_uri.csv'
        file = file_all_uri
        if os.path.isfile(file):
            log.info("reading input from file " + file)
            all_uri = pd.read_csv(file, encoding='utf-8', error_bad_lines=False)
            all_uri = all_uri['colummn'].tolist()

        # get entities if they were not cached
        if entity_list == [] or entity_list == [[]]:
            confidence = conf.get('ENTITY', 'confidence')
            entity_list, tweet_with_entity_list, all_uri = Corpus.getEntities(
                cleaned_input_list, confidence=confidence)
            # write output to csv
            file = file_entity_list
            df = pd.DataFrame(entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)
            file = file_tweet_with_entity_list
            df = pd.DataFrame(tweet_with_entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)
            file = file_all_uri
            df = pd.DataFrame(all_uri, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

        with_wiki_pages = conf.get('MAIN', 'with_wiki_pages')
        if with_wiki_pages == 'False':
            corpus = cleaned_input_list
        else:
            # -------------------------GET WIKIPEDIA PAGES---------------------------------------------
            log.info("GET WIKIPEDIA PAGES")
            wikipage_list = []
            wikipage_list_file = path_csv_output_folder + 'wikipage_list.csv'
            file = wikipage_list_file
            if os.path.isfile(file):
                log.info("reading input from file " + file)
                wikipage_list = pd.read_csv(file, encoding='utf-8', error_bad_lines=False)
                wikipage_list = wikipage_list['colummn'].tolist()
            # get wikipedia pages if they were not cached
            if wikipage_list == [] or wikipage_list == [[]]:
                wikipage_list = Corpus.getWikipediaPages(all_uri)
                wikipage_list = clean_text_ms.cleanText(wikipage_list)
                # write csv
                df = pd.DataFrame(wikipage_list, columns=["colummn"])
                df.to_csv(file, index=False)
                log.info("file saved in " + file)

            # -------------------------CREATE CORPUS---------------------------------------------------
            log.info("CREATE CORPUS")
            corpus = []
            corpus_file = path_csv_output_folder + 'corpus.csv'
            file = corpus_file
            if os.path.isfile(file):
                log.info("reading input from file " + file)
                corpus = pd.read_csv(file, encoding='utf-8', error_bad_lines=False)
                corpus = corpus['colummn'].tolist()
            # create the corpus if it was not cached
            if corpus == [] or corpus == [[]]:
                tweet_corpus = Corpus.createTweetCorpus(
                    wikipage_list, cleaned_input_list, tweet_with_entity_list)
                corpus = tweet_corpus
                if USE_WIKIPEDIA_FOR_W2V:
                    corpus += wikipage_list
                # write corpus to csv
                df = pd.DataFrame(corpus, columns=["colummn"])
                df.to_csv(file, index=False)
                log.info("file saved in " + file)

        # -------------------------TRAIN MODEL W2V-------------------------------------------------
        log.info("TRAIN W2V")
        trainW2Vmodel(corpus)

    # ----------------------TRAINING SOM------------------------------------------------
    # load trained W2V model
    w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
    log.info("loading W2V model " + conf.get('MAIN', 'path_pickle_w2v_model'))
    # get w2v words, dict words and vectors only for tweets
    embedded_words_t_w, dict_index2word_t_w, dict_word2indext_w = collectWords(w2v_model)

    # train SOM: get codebook matrix
    doTrainSom = conf.getboolean('ADVANCED_ASOM', 'do_trainSom')
    if doTrainSom or not os.path.exists(conf.get('MAIN', 'path_pickle_som_model')):
        width = int(conf.get('ADVANCED_ASOM', 'width'))
        height = int(conf.get('ADVANCED_ASOM', 'height'))
        empty_codebook_threshold = conf.getint('ADVANCED_ASOM', 'empty_codebook_threshold')

        log.info("training som [" + str(width) + "x" + str(height) + "]")
        mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w, conf, width, height)

        min_size_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'min_size_codebook_mtx'))
        step_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'step_codebook_mtx'))

        # decrease SOM dimensions while too many codebooks are empty and the map is still large enough
        while (not som_ms.isGoodResult(mySom, width, height, empty_codebook_threshold)
               and width > min_size_codebook_mtx + step_codebook_mtx):
            width = height = height - 2
            log.info("training som [" + str(width) + "x" + str(height) + "]")
            mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w, conf, width, height)

        save_obj(mySom, conf.get('MAIN', 'path_pickle_som_model'))

    # --------- PREDICT: only on tweets------------------------------------------------
    cleaned_input_list = clean_text_ms.cleanText(input_list)
    # keep only the rows that come from Twitter (source ids 5 and 6)
    cleaned_tweet_rows = []
    tweet_rows = []
    for index, is_tweet in enumerate(tweet_rows_bool):
        if is_tweet:
            cleaned_tweet_rows.append(cleaned_input_list[index])
            tweet_rows.append(input_list[index])

    # get embedded words from input
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_rows)

    word2VecMS = Word2VecMS(tweet_rows, w2v_model)
    word2VecMS.computeWord2Tweets()
    word2VecMS.saveObject()

    # load SOM
    mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    log.info("loading SOM model " + conf.get('MAIN', 'path_pickle_som_model'))

    # predict SOM codebooks and plot
    file_name = conf.get('MAIN', 'MST_html_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word, file_name, "netx")
    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word, file_name, "d3")

    # --------------------PLOT/PRINT INFO ON SOM---------------------------------
    png = som_ms.getCodebookActivation()
    num_of_topic = conf.getint('GRAPH_IMG', 'num_of_topic_for_frequencies')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model, mySom, num_of_topic, 'bar')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model, mySom, num_of_topic, 'bubble')
    png = som_ms.getUmatrix()
    plt.show()
    print som_ms.getCostOfSom()

    # -------------------------KMEANS --------------------------------------------------
    if not os.path.exists(conf.get('MAIN', 'path_pickle_cluster_model')):
        log.info("START CLUSTERING")
        mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
        make_figure = False
        mySom.fit_cluster(cluster_model=None,
                          num_cluster_min=conf.getint('ADVANCED_ASOM', 'num_cluster_min'),
                          num_cluster_max=conf.getint('ADVANCED_ASOM', 'num_cluster_max'))
        save_obj(mySom.cluster_model, conf.get('MAIN', 'path_pickle_cluster_model'))
        log.info("saved cluster model in " + conf.get('MAIN', 'path_pickle_cluster_model'))

    # make clustering and plot
    file_name = conf.get('MAIN', 'MST_cluster_csv_output_file')
    url = som_ms.doClusteringAndPlot(cleaned_tweet_rows, file_name)
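# Hedged reference sketch: the configuration sections and option names read by the code
# above, collected in one place. Only the section/option names are taken from the calls
# in this file; every value below is an invented example, and the listing is not exhaustive.
def _example_config():
    try:
        import configparser                   # Python 3
    except ImportError:
        import ConfigParser as configparser   # Python 2
    cfg = configparser.RawConfigParser()
    cfg.add_section('MAIN')
    for key, value in [
            ('path_document', './data/input/messages.csv'),
            ('path_csv_output_folder', './data/output/'),
            ('path_pickle_w2v_model', './data/models/w2v.pickle'),
            ('path_pickle_som_model', './data/models/som.pickle'),
            ('path_pickle_cluster_model', './data/models/cluster.pickle'),
            ('with_wiki_pages', 'False'),
            ('MST_html_output_file', './data/output/mst.html'),
            ('MST_html_d3_output_file', './data/output/mst_d3.html'),
            ('MST_cluster_csv_output_file', './data/output/clusters.csv')]:
        cfg.set('MAIN', key, value)
    cfg.add_section('ADVANCED_ASOM')
    for key, value in [
            ('do_trainSom', 'True'), ('width', '30'), ('height', '30'),
            ('empty_codebook_threshold', '1'),
            ('min_size_codebook_mtx', '10'), ('step_codebook_mtx', '2'),
            ('num_cluster_min', '5'), ('num_cluster_max', '15')]:
        cfg.set('ADVANCED_ASOM', key, value)
    cfg.add_section('ENTITY')
    cfg.set('ENTITY', 'confidence', '0.5')
    cfg.add_section('GRAPH_IMG')
    cfg.set('GRAPH_IMG', 'num_of_topic_for_frequencies', '10')
    return cfg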