Example #1
def trainBestSom(w2v_model, new_model=False, identifier=""):
    # get w2v words, dict words and vectors only for tweet
    embedded_words_t_w, dict_index2word_t_w, dict_word2indext_w = collectWords(w2v_model)

    width = int(conf.get('ADVANCED_ASOM', 'width'))
    height = int(conf.get('ADVANCED_ASOM', 'height'))
    empty_codebook_threshold = conf.getint('ADVANCED_ASOM', 'empty_codebook_threshold')

    log.info("training som [" + str(width) + "x" + str(height) + "]")
    mySom = trainSOM(embedded_words_t_w, dict_index2word_t_w, conf, width, height)

    min_size_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'min_size_codebook_mtx'))
    step_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'step_codebook_mtx'))

    # decrease som dimensions if we have more than one codebook empty
    while (not isGoodResult(mySom, width, height,
                            empty_codebook_threshold) and width > min_size_codebook_mtx + step_codebook_mtx):
        log.info("training som [" + str(width) + "x" + str(height) + "]")
        width = height = height - 2
        mySom = trainSOM(embedded_words_t_w, dict_index2word_t_w, conf, width, height)

    if not new_model:
        save_obj(mySom, conf.get('MAIN', 'path_pickle_som_model'))
        log.info("Model trained")
        mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    else:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold') + "som_" + str(identifier) + ".pickle"
        save_obj(mySom, filename)
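
# --- usage sketch (hypothetical): train the SOM from an existing gensim Word2Vec model.
# Assumes the module-level conf/log/save_obj helpers used by trainBestSom are importable
# and that a w2v model was already pickled under [MAIN] path_pickle_w2v_model.
from gensim.models import Word2Vec

w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
trainBestSom(w2v_model)                                         # pickle to the default SOM path
trainBestSom(w2v_model, new_model=True, identifier="fold_01")   # per-fold pickle; "fold_01" is made up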
Example #2
def trainCodebookCluster(som_model, new_model=False, identifier=""):
    make_figure = False
    cluster_model = som_model.fit_cluster(cluster_model=None,
                                          perc_subsampling=0.,
                                          default_cluster_model=0,
                                          num_cluster_min=conf.getint('ADVANCED_ASOM', 'num_cluster_min'),
                                          num_cluster_max=conf.getint('ADVANCED_ASOM', 'num_cluster_max'))

    if not new_model:
        save_obj(som_model.cluster_model, conf.get('MAIN', 'path_pickle_codebook_cluster_model'))
        log.info("saved cluster model in " + conf.get('MAIN', 'path_pickle_codebook_cluster_model'))
    else:
        filename = conf.get('MAIN', 'path_pickle_codebook_cluster_model_incr_fold') + "codebook_cluster_" + str(identifier) + ".pickle"
        save_obj(som_model.cluster_model, filename)
        log.info("saved cluster model in " + filename)
def doSomAndDryTopics(input_list, w2v_model, som_model, clustering_model):
    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, w2v_model)

    data2unit, data2cell, data2dist, data2saliency, data2saliency_index, data2maps = som_model.predict(
        embedded_words)

    log.info("fit cluster...")

    codebook2cluster = clustering_model.predict(som_model.W)

    topics = getTopics(som_model, embedded_words, dict_index2word)
    save_obj(stopwordsDictFromFile(conf.ConfigSectionMap('STOPWORDS_FILES')),
             conf.get('MAIN', 'path_pickle_stopwords_dict'))

    dried_topics = dryTopics(topics, codebook2cluster, embedded_words, dict_word2index, dict_index2word, 1, conf)
    return dried_topics
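
# --- hypothetical end-to-end call of doSomAndDryTopics: raw_tweet_list is a placeholder
# for a list of raw tweet strings; the three models are assumed to come from the helpers above.
w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
clustering_model = load_obj(conf.get('MAIN', 'path_pickle_codebook_cluster_model'))
dried_topics = doSomAndDryTopics(raw_tweet_list, w2v_model, som_model, clustering_model)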
def trainingBigram(input_list, new_model=False, identifier=""):
    lang = os.environ['LANG']  # e.g. 'fr' or 'it_core_news_sm'
    bigram_model, _ = genspacyrank.training_ngram(corpus=input_list,
                                                  lang=lang,
                                                  min_count=1,
                                                  threshold=2,
                                                  max_vocab_size=40000000,
                                                  delimiter='_',
                                                  progress_per=10000,
                                                  scoring='default',
                                                  rm_stopwords=True)

    if not new_model:
        save_obj(bigram_model, conf.get('MAIN', 'path_pickle_bigram_model'))
    else:
        filename = conf.get('MAIN', 'path_pickle_bigram_model_incr_fold') + "bigram_" + str(identifier) + ".pickle"
        save_obj(bigram_model, filename)
        log.info("saved bigram model in " + filename)
def main():
    log.info(
        "----------------------------------START------------------------------------"
    )
    reload(sys)
    sys.setdefaultencoding('utf-8')

    document_path_file = conf.get('MAIN', 'path_document')
    log.info("reading input file: " + document_path_file)

    # ------------------------READ INPUT-------------------------------------------------------
    # read csv into list of string
    input_list = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
    # read which rows are from twitter
    source_value = np.array(input_list['idriferimento_ricerca'])
    tweet_rows_bool = (source_value == 5) | (source_value == 6)
    # read all input
    input_list = input_list['messaggio'].tolist()

    # ------------------------CLEANING TEXT---------------------------------------------------
    cleaned_input_list = []
    # read csv file
    path_csv_output_folder = conf.get('MAIN', 'path_csv_output_folder')
    file = path_csv_output_folder + 'cleaned_tweet_list.csv'
    if (os.path.isfile(file)):
        log.info("reading input from file " + file)
        cleaned_input_list = pd.read_csv(file,
                                         encoding='utf-8',
                                         error_bad_lines=False)
        cleaned_input_list = cleaned_input_list['colummn'].tolist()

    if (cleaned_input_list == [] or cleaned_input_list == [[]]):
        log.info("CLEANING TEXT")
        cleaned_input_list = clean_text_ms.cleanText(input_list)

        # write output to csv
        df = pd.DataFrame(cleaned_input_list, columns=["colummn"])
        df.to_csv(file, index=False)
        log.info("file saved in " + file)

    # if no pickled word2vec model exists yet, build the corpus and train a new w2v model
    if not os.path.exists(conf.get('MAIN', 'path_pickle_w2v_model')):
        #-------------------------GET ENTITIES----------------------------------------------------
        log.info("GET ENTITIES")

        entity_list = []
        file_entity_list = path_csv_output_folder + 'entity_list.csv'
        file = file_entity_list
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            # read csv file
            entity_list = pd.read_csv(file,
                                      encoding='utf-8',
                                      error_bad_lines=False)
            entity_list = entity_list['colummn'].tolist()

        tweet_with_entity_list = []
        file_tweet_with_entity_list = path_csv_output_folder + 'tweet_with_entity_list.csv'
        file = file_tweet_with_entity_list
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            tweet_with_entity_list = pd.read_csv(file,
                                                 encoding='utf-8',
                                                 error_bad_lines=False)
            tweet_with_entity_list = tweet_with_entity_list['colummn'].tolist()

        all_uri = []
        file_all_uri = path_csv_output_folder + 'all_uri.csv'
        file = file_all_uri
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            all_uri = pd.read_csv(file,
                                  encoding='utf-8',
                                  error_bad_lines=False)
            all_uri = all_uri['colummn'].tolist()

        # get entities
        if (entity_list == [] or entity_list == [[]]):
            confidence = conf.get('ENTITY', 'confidence')
            entity_list, tweet_with_entity_list, all_uri = Corpus.getEntities(
                cleaned_input_list, confidence=confidence)

            file = file_entity_list
            # write output to csv
            df = pd.DataFrame(entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

            file = file_tweet_with_entity_list
            df = pd.DataFrame(tweet_with_entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

            file = file_all_uri
            df = pd.DataFrame(all_uri, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

        with_wiki_pages = conf.get('MAIN', 'with_wiki_pages')
        if (with_wiki_pages == 'False'):
            corpus = cleaned_input_list
        else:
            #-------------------------GET WIKIPEDIA PAGES---------------------------------------------
            log.info("GET WIKIPEDIA PAGES")
            wikipage_list = []

            wikipage_list_file = path_csv_output_folder + 'wikipage_list.csv'
            file = wikipage_list_file
            if (os.path.isfile(file)):
                log.info("reading input from file " + file)
                # read csv file
                wikipage_list = pd.read_csv(file,
                                            encoding='utf-8',
                                            error_bad_lines=False)
                wikipage_list = wikipage_list['colummn'].tolist()

            # get wikipedia page
            if (wikipage_list == [] or wikipage_list == [[]]):
                wikipage_list = Corpus.getWikipediaPages(all_uri)
                wikipage_list = clean_text_ms.cleanText(wikipage_list)

                # write csv
                df = pd.DataFrame(wikipage_list, columns=["colummn"])
                df.to_csv(file, index=False)
                log.info("file saved in " + file)

            #-------------------------CREATE CORPUS---------------------------------------------------
            print log.info("CREATE CORPUS")
            corpus = []

            # read csv file
            corpus_file = path_csv_output_folder + 'corpus.csv'
            file = corpus_file
            if (os.path.isfile(file)):
                log.info("reading input from file " + file)
                # read csv file
                corpus = pd.read_csv(file,
                                     encoding='utf-8',
                                     error_bad_lines=False)
                corpus = corpus['colummn'].tolist()

            # create corpus
            if (corpus == [] or corpus == [[]]):
                tweet_corpus = Corpus.createTweetCorpus(
                    wikipage_list, cleaned_input_list, tweet_with_entity_list)
                corpus = tweet_corpus
                if (USE_WIKIPEDIA_FOR_W2V):
                    corpus += wikipage_list

        corpus_file = path_csv_output_folder + 'corpus.csv'
        file = corpus_file
        # write corpus to csv
        df = pd.DataFrame(corpus, columns=["colummn"])
        df.to_csv(file, index=False)
        log.info("file saved in " + file)

        #-------------------------TRAIN MODEL W2V-------------------------------------------------
        # train model W2v
        log.info("TRAIN W2V")
        trainW2Vmodel(corpus)

    #----------------------TRAINING SOM------------------------------------------------
    # load trained model W2V
    w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
    log.info("loading W2V model " + conf.get('MAIN', 'path_pickle_w2v_model'))

    # get w2v words, dict words and vectors only for tweet
    embedded_words_t_w, dict_index2word_t_w, dict_word2indext_w = collectWords(
        w2v_model)

    # train SOM: get codebook matrix
    doTrainSom = conf.getboolean('ADVANCED_ASOM', 'do_trainSom')
    if doTrainSom or not os.path.exists(
            conf.get('MAIN', 'path_pickle_som_model')):
        width = int(conf.get('ADVANCED_ASOM', 'width'))
        height = int(conf.get('ADVANCED_ASOM', 'height'))
        empty_codebook_threshold = conf.getint('ADVANCED_ASOM', 'empty_codebook_threshold')

        log.info("training som [" + str(width) + "x" + str(height) + "]")
        mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w, conf,
                                width, height)

        min_size_codebook_mtx = int(
            conf.get('ADVANCED_ASOM', 'min_size_codebook_mtx'))
        step_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'step_codebook_mtx'))

        # decrease som dimensions if we have more than one codebook empty
        while (not som_ms.isGoodResult(mySom, width, height,
                                       empty_codebook_threshold)
               and width > min_size_codebook_mtx + step_codebook_mtx):
            log.info("training som [" + str(width) + "x" + str(height) + "]")
            width = height = height - 2
            mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w,
                                    conf, width, height)

        save_obj(mySom, conf.get('MAIN', 'path_pickle_som_model'))

    #--------- PREDICT: only on tweets------------------------------------------------

    cleaned_input_list = clean_text_ms.cleanText(input_list)

    # keep only the rows that come from Twitter (see tweet_rows_bool above)
    cleaned_tweet_rows = []
    tweet_rows = []
    for index, is_tweet in enumerate(tweet_rows_bool):
        if is_tweet:
            cleaned_tweet_rows.append(cleaned_input_list[index])
            tweet_rows.append(input_list[index])

    # get embedded words from input
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_rows, w2v_model)

    word2VecMS = Word2VecMS(tweet_rows, w2v_model)
    word2VecMS.computeWord2Tweets()
    word2VecMS.saveObject()

    # load SOM
    mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    log.info("loading SOM model " + conf.get('MAIN', 'path_pickle_som_model'))

    # predict SOM codebooks and plot
    file_name = conf.get('MAIN', 'MST_html_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word,
                              file_name, "netx")
    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word,
                              file_name, "d3")

    #--------------------PLOT/PRINT INFO ON SOM---------------------------------
    png = som_ms.getCodebookActivation()

    num_of_topic = conf.getint('GRAPH_IMG', 'num_of_topic_for_frequencies')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model,
                                               mySom, num_of_topic, 'bar')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model,
                                               mySom, num_of_topic, 'bubble')

    png = som_ms.getUmatrix()
    plt.show()

    print som_ms.getCostOfSom()

    #-------------------------KMEANS --------------------------------------------------
    if not os.path.exists(conf.get('MAIN', 'path_pickle_cluster_model')):
        log.info("START CLUSTERING")
        mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
        make_figure = False
        mySom.fit_cluster(cluster_model=None,
                          num_cluster_min=conf.getint('ADVANCED_ASOM',
                                                      'num_cluster_min'),
                          num_cluster_max=conf.getint('ADVANCED_ASOM',
                                                      'num_cluster_max'))

        save_obj(mySom.cluster_model,
                 conf.get('MAIN', 'path_pickle_cluster_model'))
        log.info("saved cluster model in " +
                 conf.get('MAIN', 'path_pickle_cluster_model'))

    # make clustering and plot
    file_name = conf.get('MAIN', 'MST_cluster_csv_output_file')
    url = som_ms.doClusteringAndPlot(cleaned_tweet_rows, file_name)
# excerpt from the Word2VecMS class (used above via word2VecMS.saveObject())
def saveObject(self):
    self.model.save(conf.get('MAIN', 'path_pickle_w2v_model'))
    save_obj(self.vec2tweets, conf.get('MAIN', 'path_vec2tweets'))
    save_obj(self.vec2word, conf.get('MAIN', 'path_vec2word'))
    save_obj(self.word2tweet, conf.get('MAIN', 'path_word2tweet'))