Example #1
def main():

    log.info("--------------------------DRY TOPICS------------------------------------")
    # read the input messages and keep only the Twitter sources (idriferimento_ricerca 5 and 6)
    input_list = pd.read_csv(conf.get("MAIN", "path_document"), encoding='utf-8', error_bad_lines=False)
    input_list = input_list[(input_list.idriferimento_ricerca == 5) | (input_list.idriferimento_ricerca == 6)]['messaggio'].tolist()

    # load the pre-trained word2vec, SOM and codebook-cluster models
    w2v_model = word2vec_ms.Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
    som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    cluster_model = load_obj(conf.get('MAIN', 'path_pickle_codebook_cluster_model'))

    dried_topics = Topics.doSomAndDryTopics(input_list, w2v_model, som_model, cluster_model)
    Topics.predictTopics(input_list, w2v_model, som_model, cluster_model, dried_topics)
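These examples rely on pickle helpers (load_obj, save_obj) and module-level conf/log objects defined elsewhere in the project. A minimal sketch of what the two helpers likely look like, assuming plain pickle serialization (only the names come from the examples; the implementation is an assumption):

import pickle

def save_obj(obj, path):
    # assumed implementation: serialize any Python object to disk with pickle
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    # assumed implementation: restore an object previously written by save_obj
    with open(path, 'rb') as f:
        return pickle.load(f)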
Example #2
def doClusteringAndPlot(tweet_rows, file_name):
    log.info("clustering and plot")
    # clean input
    cleaned_tweet_list = clean_text_ms.cleanText(tweet_rows)

    # get embedded words from input
    embedded_words_tweets, dict_index2word_tweet, dict_word2index_tweet = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list)

    # load SOM and cluster model
    mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    cluster_model = load_obj(conf.get('MAIN', 'path_pickle_cluster_model'))
    log.info("SOM model loaded " + conf.get('MAIN', 'path_pickle_som_model'))

    # mySom.data2unit maps each word to the codebook (SOM unit) it belongs to
    data2unit, data2cell, data2dist, data2saliency, data2saliency_index, data2maps = mySom.predict(
        embedded_words_tweets)

    log.info("fit cluster...")
    # make clustering
    data2cluster = cluster_model.predict(embedded_words_tweets)
    # data2cluster = cluster_model.predict(embedded_words_tweets)

    # -------------------------OUTPUT print table of clusters------------------------------
    path = './data/output/cluster_output.txt'
    dict_cluster_topic = getTopic(data2cluster, dict_index2word_tweet)
    printTable(dict_cluster_topic, path)

    # -------------------------OUTPUT bubble-chart cluster-----------------------
    codebook2word, codebook2index = getCodebook2Word(data2unit, data2dist, dict_index2word_tweet)
    dict_cluster2codebook = getCluster2codebook(data2cluster, data2unit)
    cluster2most_repr_word_index = getCluster2mostRepresentativeWordIndex(dict_cluster2codebook,
                                                                          codebook2index.values())

    # dict cluster - most representative words
    cluster2most_repr_words = getCluster2mostRepresentativeWords(cluster2most_repr_word_index, dict_index2word_tweet)
    # dict cluster - mean vector of most representative vectors
    cluster2mean_vector = getCluster2meanVector(cluster2most_repr_word_index, embedded_words_tweets)

    cell_frequency = mySom.cellFrequencyDistribution(embedded_words_tweets)

    # save_obj(data2cluster, "./data2cluster.pickle")
    # save_obj(codebook2word, "./codebook2word.pickle")
    # save_obj(dict_word2index_tweet, "./dict_word2index_tweet.pickle")
    # save_obj(cell_frequency, "./cell_frequency.pickle")
    url = buildClusterCsv(data2cluster, codebook2word, dict_word2index_tweet, cell_frequency, file_name)

    # build MST
    # url = plot_graph.plot_similarity_graph(numpy.array(cluster2mean_vector.values()),
    #                                        cluster2most_repr_words.values(), file_name, conf, "markers", type_chart)
    return url
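A hypothetical call site for doClusteringAndPlot, assuming the SOM and cluster models referenced by the config have already been trained and saved (the sample messages and output name are made up):

tweet_rows = [
    "nuovo vaccino disponibile da lunedi",
    "il traffico in centro e' completamente bloccato",
]
csv_url = doClusteringAndPlot(tweet_rows, "cluster_output")
print(csv_url)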
Example #3
def trainBestSom(w2v_model, new_model=False, identifier=""):
    # get w2v words, dict words and vectors only for tweet
    embedded_words_t_w, dict_index2word_t_w, dict_word2indext_w = collectWords(w2v_model)

    width = int(conf.get('ADVANCED_ASOM', 'width'))
    height = int(conf.get('ADVANCED_ASOM', 'height'))
    empty_codebook_threshold = int(conf.get('ADVANCED_ASOM', 'empty_codebook_threshold'))

    log.info("training som [" + str(width) + "x" + str(height) + "]")
    mySom = trainSOM(embedded_words_t_w, dict_index2word_t_w, conf, width, height)

    min_size_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'min_size_codebook_mtx'))
    step_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'step_codebook_mtx'))

    # decrease som dimensions if we have more than one codebook empty
    while (not isGoodResult(mySom, width, height,
                            empty_codebook_threshold) and width > min_size_codebook_mtx + step_codebook_mtx):
        log.info("training som [" + str(width) + "x" + str(height) + "]")
        width = height = height - 2
        mySom = trainSOM(embedded_words_t_w, dict_index2word_t_w, conf, width, height)

    if not new_model:
        save_obj(mySom, conf.get('MAIN', 'path_pickle_som_model'))
        log.info("Model trained")
        mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    else:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold') + "som_" + str(identifier) + ".pickle"
        save_obj(mySom, filename)
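isGoodResult is not shown in these examples. Going by the comment above ("decrease som dimensions if we have more than one codebook empty"), a plausible sketch is a check that the number of empty codebooks stays within empty_codebook_threshold; counting assignments through data2unit is an assumption:

def isGoodResult(som, width, height, empty_codebook_threshold):
    # hypothetical check: a codebook is "empty" when no word was assigned to it
    assigned_units = set(som.data2unit)
    empty_codebooks = width * height - len(assigned_units)
    return empty_codebooks <= empty_codebook_threshold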
Example #4
def getUmatrix(som_model=None):
    # load SOM
    if som_model is None:
        som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    UM, unit_xy = som_model.evaluate_UMatrix()
    filename = conf.get('MAIN', 'umatrix_filename')
    plot_graph.plotMatrix(UM, filename)
    return filename
Example #5
def getCodebookActivation(som_model=None):
    # load SOM
    if som_model is None:
        som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))

    som_model.plot_activations()
    filename = conf.get('MAIN', 'codebook_activation_filename')
    pylab.savefig(filename)
    return filename
Example #6
def main():

    log.info("---------------------------CLUSTER CODEBOOK------------------------------------")

    #-------------------------KMEANS --------------------------------------------------
    log.info("START CLUSTERING")
    mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))

    som_ms.trainCodebookCluster(mySom, new_model=False)
Example #7
def getCostOfSom(som_model=None):
    # load SOM
    if som_model is None:
        som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))

    cost = som_model.estimate_cost2(som_model.X[0:10000])
    cost = cost * 100
    cost = round(cost, 2)
    cost = str(cost) + " %"

    log.info("cost: " + cost)
    return cost
Example #8
    def __init__(self, tweets, w2v_model=None):
        # load trained model W2V
        if w2v_model is None:
            self.model = Word2Vec.load(
                conf.get('MAIN', 'path_pickle_w2v_model'))
        else:
            self.model = w2v_model

        self.vec2tweets = {}
        self.vec2word = {}
        self.word2tweet = {}

        self.tweets = tweets
        self.cleaned_tweets = clean_text_ms.cleanText(tweets)

        if os.path.exists(conf.get('MAIN', 'path_vec2tweets')):
            self.vec2tweets = load_obj(conf.get('MAIN', 'path_vec2tweets'))
        if os.path.exists(conf.get('MAIN', 'path_vec2word')):
            self.vec2word = load_obj(conf.get('MAIN', 'path_vec2word'))
        if os.path.exists(conf.get('MAIN', 'path_word2tweet')):
            self.word2tweet = load_obj(conf.get('MAIN', 'path_word2tweet'))

        self.embedded_words, self.index2word, self.word2index = getEmbeddedWords(
            self.cleaned_tweets, w2v_model)
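Example #10 below shows how this class is used in practice; in isolation the sequence looks like this (the method names are taken from that example, the sample tweets are made up):

tweet_rows = ["primo tweet di esempio", "secondo tweet di esempio"]  # raw tweet texts
w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
word2VecMS = Word2VecMS(tweet_rows, w2v_model)
word2VecMS.computeWord2Tweets()   # build the word -> tweets mapping
word2VecMS.saveObject()           # persist the mappings for later runs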
Example #9
    def __init__(self, input_list, w2v_model=None, som_model=None):
        if som_model is None:
            self.som = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
        else:
            self.som = som_model
        self.data2unit = []
        self.data2cell = []
        self.data2dist = []
        self.data2saliency = []
        self.data2saliency_index = []
        self.data2maps = []

        self.codebook2indexes = {}
        # pass w2v_model straight through; Word2VecMS loads its own model when it is None
        self.word2vecMS = Word2VecMS(input_list, w2v_model)
Example #10
def main():
    log.info(
        "----------------------------------START------------------------------------"
    )
    # Python 2 only: force UTF-8 as the default string encoding
    reload(sys)
    sys.setdefaultencoding('utf-8')

    document_path_file = conf.get('MAIN', 'path_document')
    log.info("reading input file: " + document_path_file)

    # ------------------------READ INPUT-------------------------------------------------------
    # read csv into list of string
    input_list = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
    # read which rows are from twitter
    source_value = np.array(input_list['idriferimento_ricerca'])
    tweet_rows_bool = (source_value == 5) | (source_value == 6)
    # read all input
    input_list = input_list['messaggio'].tolist()

    # ------------------------CLEANING TEXT---------------------------------------------------
    cleaned_input_list = []
    # read csv file
    path_csv_output_folder = conf.get('MAIN', 'path_csv_output_folder')
    file = path_csv_output_folder + 'cleaned_tweet_list.csv'
    if (os.path.isfile(file)):
        log.info("reading input from file " + file)
        cleaned_input_list = pd.read_csv(file,
                                         encoding='utf-8',
                                         error_bad_lines=False)
        cleaned_input_list = cleaned_input_list['colummn'].tolist()

    if (cleaned_input_list == [] or cleaned_input_list == [[]]):
        log.info("CLEANING TEXT")
        cleaned_input_list = clean_text_ms.cleanText(input_list)

        # write output to csv
        df = pd.DataFrame(cleaned_input_list, columns=["colummn"])
        df.to_csv(file, index=False)
        log.info("file saved in " + file)

    # train a w2v model if one does not exist yet or a rebuild is requested
    if not os.path.exists(conf.get('MAIN', 'path_pickle_w2v_model')):
        #-------------------------GET ENTITIES----------------------------------------------------
        log.info("GET ENTITIES")

        entity_list = []
        file_entity_list = path_csv_output_folder + 'entity_list.csv'
        file = file_entity_list
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            # read csv file
            entity_list = pd.read_csv(file,
                                      encoding='utf-8',
                                      error_bad_lines=False)
            entity_list = entity_list['colummn'].tolist()

        tweet_with_entity_list = []
        file_tweet_with_entity_list = path_csv_output_folder + 'tweet_with_entity_list.csv'
        file = file_tweet_with_entity_list
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            tweet_with_entity_list = pd.read_csv(file,
                                                 encoding='utf-8',
                                                 error_bad_lines=False)
            tweet_with_entity_list = tweet_with_entity_list['colummn'].tolist()

        all_uri = []
        file_all_uri = path_csv_output_folder + 'all_uri.csv'
        file = file_all_uri
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            all_uri = pd.read_csv(file,
                                  encoding='utf-8',
                                  error_bad_lines=False)
            all_uri = all_uri['colummn'].tolist()

        # get entities
        if (entity_list == [] or entity_list == [[]]):
            confidence = conf.get('ENTITY', 'confidence')
            entity_list, tweet_with_entity_list, all_uri = Corpus.getEntities(
                cleaned_input_list, confidence=confidence)

            file = file_entity_list
            # write output to csv
            df = pd.DataFrame(entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

            file = file_tweet_with_entity_list
            df = pd.DataFrame(tweet_with_entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

            file = file_all_uri
            df = pd.DataFrame(all_uri, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

        with_wiki_pages = conf.getboolean('MAIN', 'with_wiki_pages')
        if not with_wiki_pages:
            corpus = cleaned_input_list
        else:
            #-------------------------GET WIKIPEDIA PAGES---------------------------------------------
            log.info("GET WIKIPEDIA PAGES")
            wikipage_list = []

            wikipage_list_file = path_csv_output_folder + 'wikipage_list.csv'
            file = wikipage_list_file
            if (os.path.isfile(file)):
                log.info("reading input from file " + file)
                # read csv file
                wikipage_list = pd.read_csv(file,
                                            encoding='utf-8',
                                            error_bad_lines=False)
                wikipage_list = wikipage_list['colummn'].tolist()

            # get wikipedia page
            if (wikipage_list == [] or wikipage_list == [[]]):
                wikipage_list = Corpus.getWikipediaPages(all_uri)
                wikipage_list = clean_text_ms.cleanText(wikipage_list)

                # write csv
                df = pd.DataFrame(wikipage_list, columns=["colummn"])
                df.to_csv(file, index=False)
                log.info("file saved in " + file)

            #-------------------------CREATE CORPUS---------------------------------------------------
            log.info("CREATE CORPUS")
            corpus = []

            # read csv file
            corpus_file = path_csv_output_folder + 'corpus.csv'
            file = corpus_file
            if (os.path.isfile(file)):
                log.info("reading input from file " + file)
                # read csv file
                corpus = pd.read_csv(file,
                                     encoding='utf-8',
                                     error_bad_lines=False)
                corpus = corpus['colummn'].tolist()

            # create corpus
            if (corpus == [] or corpus == [[]]):
                tweet_corpus = Corpus.createTweetCorpus(
                    wikipage_list, cleaned_input_list, tweet_with_entity_list)
                corpus = tweet_corpus
                if (USE_WIKIPEDIA_FOR_W2V):
                    corpus += wikipage_list

        corpus_file = path_csv_output_folder + 'corpus.csv'
        file = corpus_file
        # write corpus to csv
        df = pd.DataFrame(corpus, columns=["colummn"])
        df.to_csv(file, index=False)
        log.info("file saved in " + file)

        #-------------------------TRAIN MODEL W2V-------------------------------------------------
        # train model W2v
        log.info("TRAIN W2V")
        trainW2Vmodel(corpus)

    #----------------------TRAINING SOM------------------------------------------------
    # load trained model W2V
    w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
    log.info("loading W2V model " + conf.get('MAIN', 'path_pickle_w2v_model'))

    # get w2v words, dict words and vectors only for tweet
    embedded_words_t_w, dict_index2word_t_w, dict_word2indext_w = collectWords(
        w2v_model)

    # train SOM: get codebook matrix
    doTrainSom = conf.getboolean('ADVANCED_ASOM', 'do_trainSom')
    if doTrainSom or not os.path.exists(
            conf.get('MAIN', 'path_pickle_som_model')):
        width = int(conf.get('ADVANCED_ASOM', 'width'))
        height = int(conf.get('ADVANCED_ASOM', 'height'))
        empty_codebook_threshold = int(
            conf.get('ADVANCED_ASOM', 'empty_codebook_threshold'))

        log.info("training som [" + str(width) + "x" + str(height) + "]")
        mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w, conf,
                                width, height)

        min_size_codebook_mtx = int(
            conf.get('ADVANCED_ASOM', 'min_size_codebook_mtx'))
        step_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'step_codebook_mtx'))

        # decrease som dimensions if we have more than one codebook empty
        while (not som_ms.isGoodResult(mySom, width, height,
                                       empty_codebook_threshold)
               and width > min_size_codebook_mtx + step_codebook_mtx):
            log.info("training som [" + str(width) + "x" + str(height) + "]")
            width = height = height - 2
            mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w,
                                    conf, width, height)

        save_obj(mySom, conf.get('MAIN', 'path_pickle_som_model'))

    #--------- PREDICT: only on tweets------------------------------------------------

    cleaned_input_list = clean_text_ms.cleanText(input_list)

    # keep only the rows flagged as tweets
    cleaned_tweet_rows = []
    tweet_rows = []
    for index, is_tweet in enumerate(tweet_rows_bool):
        if is_tweet:
            cleaned_tweet_rows.append(cleaned_input_list[index])
            tweet_rows.append(input_list[index])

    # get embedded words from input
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_rows)

    word2VecMS = Word2VecMS(tweet_rows, w2v_model)
    word2VecMS.computeWord2Tweets()
    word2VecMS.saveObject()

    # load SOM
    mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    log.info("loading SOM model " + conf.get('MAIN', 'path_pickle_som_model'))

    # predict SOM codebooks and plot
    file_name = conf.get('MAIN', 'MST_html_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word,
                              file_name, "netx")
    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word,
                              file_name, "d3")

    #--------------------PLOT/PRINT INFO ON SOM---------------------------------
    png = som_ms.getCodebookActivation()

    num_of_topic = conf.getint('GRAPH_IMG', 'num_of_topic_for_frequencies')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model,
                                               mySom, num_of_topic, 'bar')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model,
                                               mySom, num_of_topic, 'bubble')

    png = som_ms.getUmatrix()
    plt.show()

    print(som_ms.getCostOfSom())

    #-------------------------KMEANS --------------------------------------------------
    if not os.path.exists(conf.get('MAIN', 'path_pickle_cluster_model')):
        log.info("START CLUSTERING")
        mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
        make_figure = False
        mySom.fit_cluster(cluster_model=None,
                          num_cluster_min=conf.getint('ADVANCED_ASOM',
                                                      'num_cluster_min'),
                          num_cluster_max=conf.getint('ADVANCED_ASOM',
                                                      'num_cluster_max'))

        save_obj(mySom.cluster_model,
                 conf.get('MAIN', 'path_pickle_cluster_model'))
        log.info("saved cluster model in " +
                 conf.get('MAIN', 'path_pickle_cluster_model'))

    # make clustering and plot
    file_name = conf.get('MAIN', 'MST_cluster_csv_output_file')
    url = som_ms.doClusteringAndPlot(cleaned_tweet_rows, file_name)
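All of the examples above reference module-level conf and log objects created elsewhere in the project. A minimal sketch of that setup, assuming a standard ConfigParser ini file and the stdlib logging module (the config filename is hypothetical):

import logging

try:
    from ConfigParser import ConfigParser   # Python 2, which these examples target
except ImportError:
    from configparser import ConfigParser   # Python 3

conf = ConfigParser()
conf.read('config.ini')   # hypothetical path to the project's ini file

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
log = logging.getLogger(__name__)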