Example #1
def predictTopics(input_list,
                  w2v_model,
                  som_model,
                  cluster_model,
                  dried_topics,
                  type_chart="d3"):
    codebook2cluster = cluster_model.predict(som_model.W)

    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, w2v_model)

    graphs = []
    for index in xrange(codebook2cluster.max() + 1):
        M, words_list = getDriedTopicMatrix(index, dried_topics,
                                            embedded_words, dict_word2index)
        if len(words_list) > 10:
            # file_name_index = './data/output/dried_' + str(index) + '.json'
            file_name = conf.get('MAIN', 'MST_dried_topics_d3_base_file')
            file_name_index = file_name + str(index) + '.html'
            graph = plot_graph.plot_similarity_graph(M, words_list,
                                                     file_name_index,
                                                     type_chart)
            graphs.append(graph)
            print file_name_index

    return graphs
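A hedged usage sketch (not part of the original listing): how predictTopics might be wired up, assuming input_list is the list of raw messages and that the module-level conf, load_obj and Word2Vec helpers used by the other examples are available, together with a dried_topics structure such as the one returned by doSomAndDryTopics in Example #6.

w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
cluster_model = load_obj(conf.get('MAIN', 'path_pickle_cluster_model'))
# dried_topics as produced, for example, by doSomAndDryTopics (Example #6)
graphs = predictTopics(input_list, w2v_model, som_model, cluster_model,
                       dried_topics, type_chart="d3")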
Example #2
def getCellFrequencyDistribution(tweet_rows, w2v_model, mySom, num_of_topic=0, type_chart='bar'):
    cleaned_tweet_list = clean_text_ms.cleanText(tweet_rows)

    # get embedded words from input
    embedded_words_tweets, dict_index2word_tweet, dict_word2index_tweet = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, w2v_model)

    # predict
    data2unit, data2cell, data2dist, data2saliency, data2saliency_index, data2maps = mySom.predict(
        embedded_words_tweets)

    cell_frequency = mySom.cellFrequencyDistribution(embedded_words_tweets)

    codebook2word, codebook2index = getCodebook2Word(data2unit, data2dist, dict_index2word_tweet)

    if (type_chart == 'bubble'):
        file_name = conf.get('MAIN', 'topic_frequencies_file_bubble')
    else:
        file_name = conf.get('MAIN', 'topic_frequencies_file_bar')

    # adding empty word for empty codebook
    for i in range(0, mySom.width * mySom.width):
        if i not in codebook2word.keys():
            codebook2word[i] = ''
    codebook2word = OrderedDict(sorted(codebook2word.items()))

    url = plot_graph.plotFrequencyGraph(cell_frequency, codebook2word.values(), file_name, num_of_topic, type_chart)

    log.info("Frequency bubble chart in " + file_name)

    f = codecs.open(file_name, 'r', encoding='utf-8')
    html = f.read()
    f.close()
    return html
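A hedged usage sketch, assuming w2v_model and mySom are already loaded as in the other examples: the function writes the frequency chart to the configured file and returns its HTML.

num_of_topic = conf.getint('GRAPH_IMG', 'num_of_topic_for_frequencies')
# 'bar' or 'bubble' selects both the chart type and the output file
html = getCellFrequencyDistribution(tweet_rows, w2v_model, mySom,
                                    num_of_topic, type_chart='bar')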
Example #3
def getSomWithPrediction(input_list, w2v_model=None, som_model=None):
    # clean input
    cleaned_input_list = clean_text_ms.cleanText(input_list)

    # get embedded words from input
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_input_list, w2v_model)
    som_MS = SOM_MS(input_list, w2v_model, som_model)
    som_MS.predict(embedded_words)
    return som_MS
Example #4
def doClusteringAndPlot(tweet_rows, file_name):
    log.info("clustering and plot")
    # clean input
    cleaned_tweet_list = clean_text_ms.cleanText(tweet_rows)

    # get embedded words from input
    embedded_words_tweets, dict_index2word_tweet, dict_word2index_tweet = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list)

    # load SOM and cluster model
    mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    cluster_model = load_obj(conf.get('MAIN', 'path_pickle_cluster_model'))
    log.info("SOM model loaded " + conf.get('MAIN', 'path_pickle_som_model'))

    # mySom.data2unit maps each word to the SOM codebook (unit) that contains it
    data2unit, data2cell, data2dist, data2saliency, data2saliency_index, data2maps = mySom.predict(
        embedded_words_tweets)

    log.info("fit cluster...")
    # make clustering
    data2cluster = cluster_model.predict(embedded_words_tweets)
    # data2cluster = cluster_model.predict(embedded_words_tweets)

    # -------------------------OUTPUT print table of clusters------------------------------
    path = './data/output/cluster_output.txt'
    dict_cluster_topic = getTopic(data2cluster, dict_index2word_tweet)
    printTable(dict_cluster_topic, path)

    # -------------------------OUTPUT bubble-chart cluster-----------------------
    codebook2word, codebook2index = getCodebook2Word(data2unit, data2dist, dict_index2word_tweet)
    dict_cluster2codebook = getCluster2codebook(data2cluster, data2unit)
    cluster2most_repr_word_index = getCluster2mostRepresentativeWordIndex(dict_cluster2codebook,
                                                                          codebook2index.values())

    # dict: cluster -> most representative words
    cluster2most_repr_words = getCluster2mostRepresentativeWords(cluster2most_repr_word_index, dict_index2word_tweet)
    # dict: cluster -> mean vector of its most representative word vectors
    cluster2mean_vector = getCluster2meanVector(cluster2most_repr_word_index, embedded_words_tweets)

    cell_frequency = mySom.cellFrequencyDistribution(embedded_words_tweets)

    # save_obj(data2cluster, "./data2cluster.pickle")
    # save_obj(codebook2word, "./codebook2word.pickle")
    # save_obj(dict_word2index_tweet, "./dict_word2index_tweet.pickle")
    # save_obj(cell_frequency, "./cell_frequency.pickle")
    url = buildClusterCsv(data2cluster, codebook2word, dict_word2index_tweet, cell_frequency, file_name)

    # build MST
    # url = plot_graph.plot_similarity_graph(numpy.array(cluster2mean_vector.values()),
    #                                        cluster2most_repr_words.values(), file_name, conf, "markers", type_chart)
    return url
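A hedged usage sketch, mirroring the call at the end of main() below; doClusteringAndPlot loads the pickled SOM and cluster models itself, so only the cleaned tweet texts and an output file name are needed.

file_name = conf.get('MAIN', 'MST_cluster_csv_output_file')
url = doClusteringAndPlot(cleaned_tweet_rows, file_name)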
Example #5
def getCodebookWords(tweet_rows, w2v_model, som_model):
    log.info("predict and plot")
    cleaned_tweet_list = clean_text_ms.cleanText(tweet_rows)

    # get embedded words from input
    embedded_words_tweets, dict_index2word_tweet, dict_word2index_tweet = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, w2v_model)

    # data2unit maps each word to the SOM codebook (unit) that contains it
    data2unit, data2cell, data2dist, data2saliency, data2saliency_index, data2maps = som_model.predict(
        embedded_words_tweets)

    dict_codebook_topic = getTopic(data2unit, dict_index2word_tweet)
    return dict_codebook_topic
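A hedged usage sketch, assuming loaded w2v_model and som_model: the returned dict maps each SOM codebook to the words assigned to it.

dict_codebook_topic = getCodebookWords(tweet_rows, w2v_model, som_model)
for codebook, words in dict_codebook_topic.items():
    print codebook, words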
Example #6
def doSomAndDryTopics(input_list, w2v_model, som_model, clustering_model):
    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, w2v_model)

    data2unit, data2cell, data2dist, data2saliency, data2saliency_index, data2maps = som_model.predict(
        embedded_words)

    log.info("fit cluster...")

    codebook2cluster = clustering_model.predict(som_model.W)

    topics = getTopics(som_model, embedded_words, dict_index2word)
    save_obj(stopwordsDictFromFile(conf.ConfigSectionMap('STOPWORDS_FILES')),
             conf.get('MAIN', 'path_pickle_stopwords_dict'))

    dried_topics = dryTopics(topics, codebook2cluster, embedded_words, dict_word2index, dict_index2word, 1, conf)
    return dried_topics
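A hedged sketch of chaining this with predictTopics from Example #1, assuming the same loaded models and raw input_list as above:

dried_topics = doSomAndDryTopics(input_list, w2v_model, som_model, cluster_model)
graphs = predictTopics(input_list, w2v_model, som_model, cluster_model,
                       dried_topics, type_chart="d3")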
def main():
    log.info(
        "----------------------------------START------------------------------------"
    )
    reload(sys)
    sys.setdefaultencoding('utf-8')

    document_path_file = conf.get('MAIN', 'path_document')
    log.info("reading input file: " + document_path_file)

    # ------------------------READ INPUT-------------------------------------------------------
    # read csv into list of string
    input_list = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
    # read which rows are from twitter
    source_value = np.array(input_list['idriferimento_ricerca'])
    tweet_rows_bool = (source_value == 5) | (source_value == 6)
    # read all input
    input_list = input_list['messaggio'].tolist()

    # ------------------------CLEANING TEXT---------------------------------------------------
    cleaned_input_list = []
    # read csv file
    path_csv_output_folder = conf.get('MAIN', 'path_csv_output_folder')
    file = path_csv_output_folder + 'cleaned_tweet_list.csv'
    if (os.path.isfile(file)):
        log.info("reading input from file " + file)
        cleaned_input_list = pd.read_csv(file,
                                         encoding='utf-8',
                                         error_bad_lines=False)
        cleaned_input_list = cleaned_input_list['colummn'].tolist()

    if (cleaned_input_list == [] or cleaned_input_list == [[]]):
        log.info("CLEANING TEXT")
        cleaned_input_list = clean_text_ms.cleanText(input_list)

        # write output to csv
        df = pd.DataFrame(cleaned_input_list, columns=["colummn"])
        df.to_csv(file, index=False)
        log.info("file saved in " + file)

    # if word2vec does not exist or rebuild is setted train w2v model
    if not os.path.exists(conf.get('MAIN', 'path_pickle_w2v_model')):
        #-------------------------GET ENTITIES----------------------------------------------------
        log.info("GET ENTITIES")

        entity_list = []
        file_entity_list = path_csv_output_folder + 'entity_list.csv'
        file = file_entity_list
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            # read csv file
            entity_list = pd.read_csv(file,
                                      encoding='utf-8',
                                      error_bad_lines=False)
            entity_list = entity_list['colummn'].tolist()

        tweet_with_entity_list = []
        file_tweet_with_entity_list = path_csv_output_folder + 'tweet_with_entity_list.csv'
        file = file_tweet_with_entity_list
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            tweet_with_entity_list = pd.read_csv(file,
                                                 encoding='utf-8',
                                                 error_bad_lines=False)
            tweet_with_entity_list = tweet_with_entity_list['colummn'].tolist()

        all_uri = []
        file_all_uri = path_csv_output_folder + 'all_uri.csv'
        file = file_all_uri
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            all_uri = pd.read_csv(file,
                                  encoding='utf-8',
                                  error_bad_lines=False)
            all_uri = all_uri['colummn'].tolist()

        # get entities
        if (entity_list == [] or entity_list == [[]]):
            confidence = conf.get('ENTITY', 'confidence')
            entity_list, tweet_with_entity_list, all_uri = Corpus.getEntities(
                cleaned_input_list, confidence=confidence)

            file = file_entity_list
            # write output to csv
            df = pd.DataFrame(entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

            file = file_tweet_with_entity_list
            df = pd.DataFrame(tweet_with_entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

            file = file_all_uri
            df = pd.DataFrame(all_uri, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

        with_wiki_pages = conf.get('MAIN', 'with_wiki_pages')
        if (with_wiki_pages == 'False'):
            corpus = cleaned_input_list
        else:
            #-------------------------GET WIKIPEDIA PAGES---------------------------------------------
            log.info("GET WIKIPEDIA PAGES")
            wikipage_list = []

            wikipage_list_file = path_csv_output_folder + 'wikipage_list.csv'
            file = wikipage_list_file
            if (os.path.isfile(file)):
                log.info("reading input from file " + file)
                # read csv file
                wikipage_list = pd.read_csv(file,
                                            encoding='utf-8',
                                            error_bad_lines=False)
                wikipage_list = wikipage_list['colummn'].tolist()

            # get wikipedia page
            if (wikipage_list == [] or wikipage_list == [[]]):
                wikipage_list = Corpus.getWikipediaPages(all_uri)
                wikipage_list = clean_text_ms.cleanText(wikipage_list)

                # write csv
                df = pd.DataFrame(wikipage_list, columns=["colummn"])
                df.to_csv(file, index=False)
                log.info("file saved in " + file)

            #-------------------------CREATE CORPUS---------------------------------------------------
            log.info("CREATE CORPUS")
            corpus = []

            # read csv file
            corpus_file = path_csv_output_folder + 'corpus.csv'
            file = corpus_file
            if (os.path.isfile(file)):
                log.info("reading input from file " + file)
                # read csv file
                corpus = pd.read_csv(file,
                                     encoding='utf-8',
                                     error_bad_lines=False)
                corpus = corpus['colummn'].tolist()

            # create corpus
            if (corpus == [] or corpus == [[]]):
                tweet_corpus = Corpus.createTweetCorpus(
                    wikipage_list, cleaned_input_list, tweet_with_entity_list)
                corpus = tweet_corpus
                if (USE_WIKIPEDIA_FOR_W2V):
                    corpus += wikipage_list

        corpus_file = path_csv_output_folder + 'corpus.csv'
        file = corpus_file
        # write corpus to csv
        df = pd.DataFrame(corpus, columns=["colummn"])
        df.to_csv(file, index=False)
        log.info("file saved in " + file)

        #-------------------------TRAIN MODEL W2V-------------------------------------------------
        # train model W2v
        log.info("TRAIN W2V")
        trainW2Vmodel(corpus)

    #----------------------TRAINING SOM------------------------------------------------
    # load trained model W2V
    w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
    log.info("loading W2V model " + conf.get('MAIN', 'path_pickle_w2v_model'))

    # get w2v words, dict words and vectors only for tweet
    embedded_words_t_w, dict_index2word_t_w, dict_word2indext_w = collectWords(
        w2v_model)

    # train SOM: get codebook matrix
    doTrainSom = conf.getboolean('ADVANCED_ASOM', 'do_trainSom')
    if doTrainSom or not os.path.exists(
            conf.get('MAIN', 'path_pickle_som_model')):
        width = int(conf.get('ADVANCED_ASOM', 'width'))
        height = int(conf.get('ADVANCED_ASOM', 'height'))
        empty_codebook_threshold = conf.getint(
            'ADVANCED_ASOM', 'empty_codebook_threshold')

        log.info("training som [" + str(width) + "x" + str(height) + "]")
        mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w, conf,
                                width, height)

        min_size_codebook_mtx = int(
            conf.get('ADVANCED_ASOM', 'min_size_codebook_mtx'))
        step_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'step_codebook_mtx'))

        # decrease som dimensions if we have more than one codebook empty
        while (not som_ms.isGoodResult(mySom, width, height,
                                       empty_codebook_threshold)
               and width > min_size_codebook_mtx + step_codebook_mtx):
            log.info("training som [" + str(width) + "x" + str(height) + "]")
            width = height = height - 2
            mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w,
                                    conf, width, height)

        save_obj(mySom, conf.get('MAIN', 'path_pickle_som_model'))

    #--------- PREDICT: only on tweets------------------------------------------------

    cleaned_input_list = clean_text_ms.cleanText(input_list)

    # keep only the rows flagged as tweets (source 5 or 6)
    cleaned_tweet_rows = []
    tweet_rows = []
    for index, is_tweet in enumerate(tweet_rows_bool):
        if is_tweet:
            cleaned_tweet_rows.append(cleaned_input_list[index])
            tweet_rows.append(input_list[index])

    # get embedded words from input
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_rows)

    word2VecMS = Word2VecMS(tweet_rows, w2v_model)
    word2VecMS.computeWord2Tweets()
    word2VecMS.saveObject()

    # load SOM
    mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    log.info("loading SOM model " + conf.get('MAIN', 'path_pickle_som_model'))

    # predict SOM codebooks and plot
    file_name = conf.get('MAIN', 'MST_html_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word,
                              file_name, "netx")
    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word,
                              file_name, "d3")

    #--------------------PLOT/PRINT INFO ON SOM---------------------------------
    png = som_ms.getCodebookActivation()

    num_of_topic = conf.getint('GRAPH_IMG', 'num_of_topic_for_frequencies')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model,
                                               mySom, num_of_topic, 'bar')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model,
                                               mySom, num_of_topic, 'bubble')

    png = som_ms.getUmatrix()
    plt.show()

    print som_ms.getCostOfSom()

    #-------------------------KMEANS --------------------------------------------------
    if not os.path.exists(conf.get('MAIN', 'path_pickle_cluster_model')):
        log.info("START CLUSTERING")
        mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
        make_figure = False
        mySom.fit_cluster(cluster_model=None,
                          num_cluster_min=conf.getint('ADVANCED_ASOM',
                                                      'num_cluster_min'),
                          num_cluster_max=conf.getint('ADVANCED_ASOM',
                                                      'num_cluster_max'))

        save_obj(mySom.cluster_model,
                 conf.get('MAIN', 'path_pickle_cluster_model'))
        log.info("saved cluster model in " +
                 conf.get('MAIN', 'path_pickle_cluster_model'))

    # make clustering and plot
    file_name = conf.get('MAIN', 'MST_cluster_csv_output_file')
    url = som_ms.doClusteringAndPlot(cleaned_tweet_rows, file_name)
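The listing shows no module entry point; a minimal guard, assuming main() is meant to be run as a script:

if __name__ == '__main__':
    main()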
Example #8
def doSomAndPlot1():
    """
        Apply SOM and plot the result as a codebook MST
        Takes a word2vec model id, a SOM model id, the list of tweet messages (or the URL of a CSV with messages) and the type of result, and returns a result graph
        ---
        parameters:
          - in: body
            name: body
            schema:
              type: object
              properties:
                w2v_model_id:
                  type: string
                  description: id of model to use for word embedding
                som_model_id:
                  type: string
                  description: id of SOM model
                type_chart:
                  type: string
                  description: type of result, "d3" (html) or "json"
                url_input:
                  type: string
                  description: url of the csv with messages
                tweets:
                  type: array
                  items:
                    type: object
                    properties:
                      message:
                        type: string
                        description: tweet message
            required: true
        responses:
          200:
            description: graph of entities
            schema:
              type: object
              properties:
                directed:
                  type: boolean
                graph:
                  type: object
                links:
                  type: array
                  items:
                    type: object
                    properties:
                      source:
                        type: integer
                      target:
                        type: integer
                multigraph:
                  type: boolean
                nodes:
                  type: array
                  items:
                    type: object
                    properties:
                      id:
                        type: integer
                      name:
                        type: string
                      pos:
                        type: array
                        items:
                          type: integer
          500:
            description: Internal Server Error
            schema:
              type: object
              properties:
                error:
                  type: string
          299:
            description: Model is still training or not trained
            schema:
              type: object
              properties:
                warning:
                  type: string
    """
    log.info("/analytics-backend/doSomAndPlot")

    # reading json input
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)
    type_chart = data_json["type_chart"]
    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]

    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)

        # DEBUGGING
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
            df = df.head()

        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        model = word2vec_ms.Word2Vec.load(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
                w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    # get embedded words from input
    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, model)

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    response = som_ms.doSomAndPlot(som_model, embedded_words, dict_index2word,
                                   file_name, type_chart)

    if (type_chart == "d3"):
        return render_template('MST_d3.html')
    elif (type_chart == "json"):
        return jsonify(response)
    else:
        return internalServerError(500)
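A hedged client-side sketch for this endpoint. The base URL and route are assumptions (the route decorator is not shown in the listing; the log line above suggests /analytics-backend/doSomAndPlot), and the model ids are placeholders.

import requests

payload = {
    "w2v_model_id": "1",          # placeholder id
    "som_model_id": "1",          # placeholder id
    "type_chart": "json",
    "tweets": [{"message": "example tweet text"}],
}
resp = requests.post("http://localhost:5000/analytics-backend/doSomAndPlot",
                     json=payload)
print resp.status_code
print resp.json()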
Example #9
def getEmbeddedWords():
    """
        Word embedding
        Takes the id of a word2vec model and a list of tweet messages and returns a list of word vectors
        ---
        parameters:
          - in: body
            name: body
            schema:
              type: object
              properties:
                w2v_model_id:
                  type: string
                  description: id of model to use for word embedding
                tweets:
                  type: array
                  items:
                    type: object
                    properties:
                      message:
                        type: string
                        description: tweet message
            required: true
        responses:
          200:
            description: list of embedded words
            schema:
              type: array
              items:
                type: array
                items:
                  type: string
          500:
            description: Internal Server Error
            schema:
              type: object
              properties:
                error:
                  type: string
          299:
            description: Model is still training or not trained
            schema:
              type: object
              properties:
                warning:
                  type: string
    """
    log.info("/analytics-backend/getEmbeddedWords")
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    model_id = data_json["w2v_model_id"]
    input_list = json.dumps(data_json["tweets"])
    input_list = pd.read_json(input_list, encoding='utf8')['message'].tolist()

    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(model_id) + ".pickle"
    try:
        model = word2vec_ms.Word2Vec.load(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                            ) + "word2vec_" + str(model_id) + "_training.txt"
        return returnModelStatus(filename, model_id)

    embedded_words_tweets, dict_index2word_tweet, dict_word2index_tweet = word2vec_ms.getEmbeddedWords(
        input_list, model)

    embedded_list = embedded_words_tweets.tolist()
    return jsonify(embedded_list)
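A hedged client-side sketch, again with an assumed local base URL and the route suggested by the log line above; the model id is a placeholder and the response is the list of embedded word vectors.

import requests

payload = {
    "w2v_model_id": "1",          # placeholder id
    "tweets": [{"message": "example tweet text"}],
}
resp = requests.post("http://localhost:5000/analytics-backend/getEmbeddedWords",
                     json=payload)
vectors = resp.json()
print len(vectors)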