Code example #1
def extractKeywords(input_list, bigram_model):
    input_list = clean_text_ms.cleanText(input_list)

    # output = []
    # output = splitText(input_list, output)
    keywds_list = []

    i = 0
    for input in input_list:
        keywds, graph, text, ngrams = genspacyrank.extract_keywords(
            text=input,
            # lang='en',
            lang='fr',
            # bigram_model=bigram_model,
            bigram_model=None,
            trigram_model=None,
            selected_pos=['V', 'N', 'J'],
            # rm_stopwords=True
            rm_stopwords=False)
        keywords = []
        for k in keywds:
            keywords.append(k[0])
        keywds_list.append(keywords)
        print "Processing message " + str(i) + " of " + str(len(input_list))
        i = i + 1
        # DEBUGGING
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            if i == 1000:
                return keywds_list

    # keywds = utilities.convertLisfOfListToList(keywds_list)
    # return keywds
    return keywds_list
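
A minimal usage sketch for extractKeywords above (not part of the project): it assumes the clean_text_ms, genspacyrank and conf globals the function relies on are importable, and passes no bigram model since the call inside ignores it anyway.

# Hypothetical caller: a couple of raw French messages, no bigram model.
messages = [u"Le service est tombe en panne ce matin",
            u"Impossible de se connecter a l'application"]
keywords_per_message = extractKeywords(messages, bigram_model=None)
for kws in keywords_per_message:
    print kws  # one list of keyword strings per input message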
Code example #2
File: topics.py Project: OpenReqEU/OpenReq-redmine
def predictTopics(input_list,
                  w2v_model,
                  som_model,
                  cluster_model,
                  dried_topics,
                  type_chart="d3"):
    codebook2cluster = cluster_model.predict(som_model.W)

    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, w2v_model)

    graphs = []
    for index in xrange(codebook2cluster.max() + 1):
        M, words_list = getDriedTopicMatrix(index, dried_topics,
                                            embedded_words, dict_word2index)
        if len(words_list) > 10:
            # file_name_index = './data/output/dried_' + str(index) + '.json'
            file_name = conf.get('MAIN', 'MST_dried_topics_d3_base_file')
            file_name_index = file_name + str(index) + '.html'
            graph = plot_graph.plot_similarity_graph(M, words_list,
                                                     file_name_index,
                                                     type_chart)
            graphs.append(graph)
            print file_name_index

    return graphs
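
A short, hedged sketch of calling predictTopics, assuming a list of raw messages (input_list) plus the w2v/SOM/cluster models and the dried topics have already been produced (see the next example for doSomAndDryTopics):

# Sketch: one d3 similarity graph is produced per dried topic with more than 10 words.
graphs = predictTopics(input_list, w2v_model, som_model, cluster_model,
                       dried_topics, type_chart="d3")
print str(len(graphs)) + " topic graphs written"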
Code example #3
File: topics.py Project: OpenReqEU/analytics-backend
def doSomAndDryTopics(input_list, w2v_model, som_model, clustering_model):
    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, w2v_model)

    data2unit, data2cell, data2dist, data2saliency, data2saliency_index, data2maps = som_model.predict(
        embedded_words)

    log.info("fit cluster...")

    codebook2cluster = clustering_model.predict(som_model.W)

    topics = getTopics(som_model, embedded_words, dict_index2word)
    save_obj(stopwordsDictFromFile(conf.ConfigSectionMap('STOPWORDS_FILES')),
             conf.get('MAIN', 'path_pickle_stopwords_dict'))

    dried_topics = dryTopics(topics, codebook2cluster, embedded_words, dict_word2index, dict_index2word, 1, conf)
    return dried_topics
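
A sketch of wiring the function above to previously trained models; the loading mirrors the other examples in this listing (Word2Vec.load, load_obj) and the config keys are the ones used in example #7.

# Sketch: load trained models and dry the topics for a batch of raw messages.
# input_list: list of raw message strings.
w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
som_model = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
clustering_model = load_obj(conf.get('MAIN', 'path_pickle_cluster_model'))
dried_topics = doSomAndDryTopics(input_list, w2v_model, som_model, clustering_model)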
Code example #4
def createCorpus(cleaned_input_list):
    # -------------------------GET ENTITIES----------------------------------------------------
    log.info("GET ENTITIES")
    entity_list = []
    confidence = conf.getfloat('ENTITY', 'confidence')
    entity_list, tweet_with_entity_list, all_uri = getEntities(
        cleaned_input_list, confidence=confidence)

    # -------------------------GET WIKIPEDIA PAGES---------------------------------------------
    log.info("GET WIKIPEDIA PAGES")
    wikipage_list = getWikipediaPages(all_uri)
    wikipage_list = clean_text_ms.cleanText(wikipage_list)

    # -------------------------CREATE CORPUS---------------------------------------------------
    log.info("CREATE CORPUS")
    tweet_corpus = createTweetCorpus(wikipage_list, cleaned_input_list,
                                     tweet_with_entity_list)
    corpus = tweet_corpus
    corpus += wikipage_list
    return corpus
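
A brief sketch of the intended flow around createCorpus: clean the raw messages first, build the corpus, then feed it to word2vec training (trainW2Vmodel appears in example #7). The input file name is illustrative only; the column name follows example #7.

# Sketch: raw messages -> cleaned text -> corpus -> w2v training.
raw_messages = pd.read_csv('messages.csv', encoding='utf-8')['messaggio'].tolist()  # hypothetical file
cleaned_input_list = clean_text_ms.cleanText(raw_messages)
corpus = createCorpus(cleaned_input_list)
trainW2Vmodel(corpus)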
Code example #5
File: app.py Project: OpenReqEU/analytics-backend
def cleanText():
    """
        Clean text
        Takes a list of tweet messages and returns a list of cleaned messages
        ---
        parameters:
          - in: body
            name: body
            schema:
              type: array
              items:
                type: object
                properties:
                  message:
                    type: string
                    description: tweet message
            required: true
        responses:
          200:
            description: Text cleaned
            schema:
                type: array
                items:
                    type: string
          500:
            description: Internal Server Error
            schema:
                type: object
                properties:
                    error:
                     type: string
    """
    log.info("/analytics-backend/cleanText")
    data_json = json.dumps(request.get_json(silent=True))
    input_list = pd.read_json(data_json, encoding='utf8')['message'].tolist()

    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    return jsonify(cleaned_tweet_list)
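
Given the request/response schema in the docstring, a client call to this endpoint might look like the following; the route is taken from the log line above, while the host and port are assumptions.

# Hypothetical client call to /analytics-backend/cleanText (host/port assumed).
import requests

payload = [{"message": "RT @user: the app keeps crashing!!! http://t.co/xyz"}]
resp = requests.post("http://localhost:5000/analytics-backend/cleanText", json=payload)
print resp.json()  # list of cleaned message strings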
Code example #6
    def __init__(self, tweets, w2v_model=None):
        # load trained model W2V
        if w2v_model is None:
            self.model = Word2Vec.load(
                conf.get('MAIN', 'path_pickle_w2v_model'))
        else:
            self.model = w2v_model

        self.vec2tweets = {}
        self.vec2word = {}
        self.word2tweet = {}

        self.tweets = tweets
        self.cleaned_tweets = clean_text_ms.cleanText(tweets)

        if os.path.exists(conf.get('MAIN', 'path_vec2tweets')):
            self.vec2tweets = load_obj(conf.get('MAIN', 'path_vec2tweets'))
        if os.path.exists(conf.get('MAIN', 'path_vec2word')):
            self.vec2word = load_obj(conf.get('MAIN', 'path_vec2word'))
        if os.path.exists(conf.get('MAIN', 'path_word2tweet')):
            self.word2tweet = load_obj(conf.get('MAIN', 'path_word2tweet'))

        self.embedded_words, self.index2word, self.word2index = getEmbeddedWords(
            self.cleaned_tweets, self.model)
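
A minimal construction sketch for this wrapper; the class name Word2VecMS and the method calls are taken from example #7, and passing an already-loaded model skips the pickle load in __init__.

# Sketch: build the wrapper around an already-loaded word2vec model.
w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
word2VecMS = Word2VecMS(tweet_rows, w2v_model)  # tweet_rows: list of raw tweet strings
word2VecMS.computeWord2Tweets()
word2VecMS.saveObject()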
Code example #7
def main():
    log.info(
        "----------------------------------START------------------------------------"
    )
    reload(sys)
    sys.setdefaultencoding('utf-8')

    document_path_file = conf.get('MAIN', 'path_document')
    log.info("reading input file: " + document_path_file)

    # ------------------------READ INPUT-------------------------------------------------------
    # read csv into list of string
    input_list = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
    # read which rows are from twitter
    source_value = np.array(input_list['idriferimento_ricerca'])
    tweet_rows_bool = (source_value == 5) | (source_value == 6)
    # read all input
    input_list = input_list['messaggio'].tolist()

    # ------------------------CLEANING TEXT---------------------------------------------------
    cleaned_input_list = []
    # read csv file
    path_csv_output_folder = conf.get('MAIN', 'path_csv_output_folder')
    file = path_csv_output_folder + 'cleaned_tweet_list.csv'
    if (os.path.isfile(file)):
        log.info("reading input from file " + file)
        cleaned_input_list = pd.read_csv(file,
                                         encoding='utf-8',
                                         error_bad_lines=False)
        cleaned_input_list = cleaned_input_list['colummn'].tolist()

    if (cleaned_input_list == [] or cleaned_input_list == [[]]):
        log.info("CLEANING TEXT")
        cleaned_input_list = clean_text_ms.cleanText(input_list)

        # write output to csv
        df = pd.DataFrame(cleaned_input_list, columns=["colummn"])
        df.to_csv(file, index=False)
        log.info("file saved in " + file)

    # if the word2vec model does not exist (or a rebuild is requested) train the w2v model
    if not os.path.exists(conf.get('MAIN', 'path_pickle_w2v_model')):
        #-------------------------GET ENTITIES----------------------------------------------------
        log.info("GET ENTITIES")

        entity_list = []
        file_entity_list = path_csv_output_folder + 'entity_list.csv'
        file = file_entity_list
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            # read csv file
            entity_list = pd.read_csv(file,
                                      encoding='utf-8',
                                      error_bad_lines=False)
            entity_list = entity_list['colummn'].tolist()

        tweet_with_entity_list = []
        file_tweet_with_entity_list = path_csv_output_folder + 'tweet_with_entity_list.csv'
        file = file_tweet_with_entity_list
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            tweet_with_entity_list = pd.read_csv(file,
                                                 encoding='utf-8',
                                                 error_bad_lines=False)
            tweet_with_entity_list = tweet_with_entity_list['colummn'].tolist()

        all_uri = []
        file_all_uri = path_csv_output_folder + 'all_uri.csv'
        file = file_all_uri
        if (os.path.isfile(file)):
            log.info("reading input from file " + file)
            all_uri = pd.read_csv(file,
                                  encoding='utf-8',
                                  error_bad_lines=False)
            all_uri = all_uri['colummn'].tolist()

        # get entities
        if (entity_list == [] or entity_list == [[]]):
            confidence = conf.getfloat('ENTITY', 'confidence')
            entity_list, tweet_with_entity_list, all_uri = Corpus.getEntities(
                cleaned_input_list, confidence=confidence)

            file = file_entity_list
            # write output to csv
            df = pd.DataFrame(entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

            file = file_tweet_with_entity_list
            df = pd.DataFrame(tweet_with_entity_list, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

            file = file_all_uri
            df = pd.DataFrame(all_uri, columns=["colummn"])
            df.to_csv(file, index=False)
            log.info("file saved in " + file)

        with_wiki_pages = conf.get('MAIN', 'with_wiki_pages')
        if (with_wiki_pages == 'False'):
            corpus = cleaned_input_list
        else:
            #-------------------------GET WIKIPEDIA PAGES---------------------------------------------
            log.info("GET WIKIPEDIA PAGES")
            wikipage_list = []

            wikipage_list_file = path_csv_output_folder + 'wikipage_list.csv'
            file = wikipage_list_file
            if (os.path.isfile(file)):
                log.info("reading input from file " + file)
                # read csv file
                wikipage_list = pd.read_csv(file,
                                            encoding='utf-8',
                                            error_bad_lines=False)
                wikipage_list = wikipage_list['colummn'].tolist()

            # get wikipedia page
            if (wikipage_list == [] or wikipage_list == [[]]):
                wikipage_list = Corpus.getWikipediaPages(all_uri)
                wikipage_list = clean_text_ms.cleanText(wikipage_list)

                # write csv
                df = pd.DataFrame(wikipage_list, columns=["colummn"])
                df.to_csv(file, index=False)
                log.info("file saved in " + file)

            #-------------------------CREATE CORPUS---------------------------------------------------
            log.info("CREATE CORPUS")
            corpus = []

            # read csv file
            corpus_file = path_csv_output_folder + 'corpus.csv'
            file = corpus_file
            if (os.path.isfile(file)):
                log.info("reading input from file " + file)
                # read csv file
                corpus = pd.read_csv(file,
                                     encoding='utf-8',
                                     error_bad_lines=False)
                corpus = corpus['colummn'].tolist()

            # create corpus
            if (corpus == [] or corpus == [[]]):
                tweet_corpus = Corpus.createTweetCorpus(
                    wikipage_list, cleaned_input_list, tweet_with_entity_list)
                corpus = tweet_corpus
                if (USE_WIKIPEDIA_FOR_W2V):
                    corpus += wikipage_list

        corpus_file = path_csv_output_folder + 'corpus.csv'
        file = corpus_file
        # write corpus to csv
        df = pd.DataFrame(corpus, columns=["colummn"])
        df.to_csv(file, index=False)
        log.info("file saved in " + file)

        #-------------------------TRAIN MODEL W2V-------------------------------------------------
        # train model W2v
        log.info("TRAIN W2V")
        trainW2Vmodel(corpus)

    #----------------------TRAINING SOM------------------------------------------------
    # load trained model W2V
    w2v_model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
    log.info("loading W2V model " + conf.get('MAIN', 'path_pickle_w2v_model'))

    # get w2v words, dict words and vectors only for tweet
    embedded_words_t_w, dict_index2word_t_w, dict_word2indext_w = collectWords(
        w2v_model)

    # train SOM: get codebook matrix
    doTrainSom = conf.getboolean('ADVANCED_ASOM', 'do_trainSom')
    if doTrainSom or not os.path.exists(
            conf.get('MAIN', 'path_pickle_som_model')):
        width = int(conf.get('ADVANCED_ASOM', 'width'))
        height = int(conf.get('ADVANCED_ASOM', 'height'))
        empty_codebook_threshold = conf.getint('ADVANCED_ASOM',
                                               'empty_codebook_threshold')

        log.info("training som [" + str(width) + "x" + str(height) + "]")
        mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w, conf,
                                width, height)

        min_size_codebook_mtx = int(
            conf.get('ADVANCED_ASOM', 'min_size_codebook_mtx'))
        step_codebook_mtx = int(conf.get('ADVANCED_ASOM', 'step_codebook_mtx'))

        # decrease som dimensions if we have more than one codebook empty
        while (not som_ms.isGoodResult(mySom, width, height,
                                       empty_codebook_threshold)
               and width > min_size_codebook_mtx + step_codebook_mtx):
            log.info("training som [" + str(width) + "x" + str(height) + "]")
            width = height = height - 2
            mySom = som_ms.trainSOM(embedded_words_t_w, dict_index2word_t_w,
                                    conf, width, height)

        save_obj(mySom, conf.get('MAIN', 'path_pickle_som_model'))

    #--------- PREDICT: only on tweets------------------------------------------------

    cleaned_input_list = clean_text_ms.cleanText(input_list)

    # getting only tweets of 3
    cleaned_tweet_rows = []
    tweet_rows = []
    index = 0
    for item in tweet_rows_bool:
        if item == True:
            cleaned_tweet_rows.append(cleaned_input_list[index])
            tweet_rows.append(input_list[index])
        index = index + 1

    # get embedded words from input
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_rows, w2v_model)

    word2VecMS = Word2VecMS(tweet_rows, w2v_model)
    word2VecMS.computeWord2Tweets()
    word2VecMS.saveObject()

    # load SOM
    mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
    log.info("loading SOM model " + conf.get('MAIN', 'path_pickle_som_model'))

    # predict SOM codebooks and plot
    file_name = conf.get('MAIN', 'MST_html_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word,
                              file_name, "netx")
    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    url = som_ms.doSomAndPlot(mySom, embedded_words, dict_index2word,
                              file_name, "d3")

    #--------------------PLOT/PRINT INFO ON SOM---------------------------------
    png = som_ms.getCodebookActivation()

    num_of_topic = conf.getint('GRAPH_IMG', 'num_of_topic_for_frequencies')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model,
                                               mySom, num_of_topic, 'bar')
    html = som_ms.getCellFrequencyDistribution(cleaned_tweet_rows, w2v_model,
                                               mySom, num_of_topic, 'bubble')

    png = som_ms.getUmatrix()
    plt.show()

    print som_ms.getCostOfSom()

    #-------------------------KMEANS --------------------------------------------------
    if not os.path.exists(conf.get('MAIN', 'path_pickle_cluster_model')):
        log.info("START CLUSTERING")
        mySom = load_obj(conf.get('MAIN', 'path_pickle_som_model'))
        make_figure = False
        mySom.fit_cluster(cluster_model=None,
                          num_cluster_min=conf.getint('ADVANCED_ASOM',
                                                      'num_cluster_min'),
                          num_cluster_max=conf.getint('ADVANCED_ASOM',
                                                      'num_cluster_max'))

        save_obj(mySom.cluster_model,
                 conf.get('MAIN', 'path_pickle_cluster_model'))
        log.info("saved cluster model in " +
                 conf.get('MAIN', 'path_pickle_cluster_model'))

    # make clustering and plot
    file_name = conf.get('MAIN', 'MST_cluster_csv_output_file')
    url = som_ms.doClusteringAndPlot(cleaned_tweet_rows, file_name)
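
main() above is driven entirely by an INI-style config read through conf. The sketch below lists the sections and keys the function actually reads; every path and value is an illustrative placeholder, and the real file may contain more entries.

[MAIN]
debugging = False
path_document = data/input/messages.csv
path_csv_output_folder = data/output/
path_pickle_w2v_model = data/models/w2v.pickle
path_pickle_som_model = data/models/som.pickle
path_pickle_cluster_model = data/models/cluster.pickle
with_wiki_pages = False
MST_html_output_file = data/output/mst.html
MST_html_d3_output_file = data/output/mst_d3.html
MST_cluster_csv_output_file = data/output/clusters.csv

[ADVANCED_ASOM]
do_trainSom = True
width = 20
height = 20
empty_codebook_threshold = 1
min_size_codebook_mtx = 8
step_codebook_mtx = 2
num_cluster_min = 5
num_cluster_max = 15

[GRAPH_IMG]
num_of_topic_for_frequencies = 10

[ENTITY]
confidence = 0.5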
Code example #8
File: app.py Project: OpenReqEU/analytics-backend
def doSomAndPlot1():
    """
        Apply SOM and plot the resulting codebook MST
        Takes a word2vec model id, a SOM model id, the list of tweet messages (or the URL of a CSV with messages) and the type of result, and returns the resulting graph
        ---
        parameters:
          - in: body
            name: body
            schema:
              type: object
              properties:
                w2v_model_id:
                  type: string
                  description: id of model to use for word embedding
                som_model_id:
                  type: string
                  description: id of SOM model
                type_chart:
                  type: string
                  description: type of result, "d3" (html) or "json"
                url_input:
                  type: string
                  description: url of the csv with messages
                tweets:
                  type: array
                  items:
                    type: object
                    properties:
                      message:
                        type: string
                        description: tweet message
            required: true
        responses:
          200:
            description: graph of entities
            schema:
              type: object
              properties:
                directed:
                  type: boolean
                graph:
                  type: object
                links:
                  type: array
                  items:
                    type: object
                    properties:
                      source:
                        type: integer
                      target:
                        type: integer
                multigraph:
                  type: boolean
                nodes:
                  type: array
                  items:
                    type: object
                    properties:
                      id:
                        type: integer
                      name:
                        type: string
                      pos:
                        type: array
                        items:
                          type: integer
          500:
            description: Internal Server Error
            schema:
                type: object
                properties:
                    error:
                     type: string
          299:
            description: Model is still training or not trained
            schema:
                type: object
                properties:
                    warning:
                     type: string
    """
    log.info("/analytics-backend/doSomAndPlot")

    # reading json input
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)
    type_chart = data_json["type_chart"]
    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]

    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)

        # DEBUGGING
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
            df = df.head()

        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        model = word2vec_ms.Word2Vec.load(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
                w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    # get embedded words from input
    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, model)

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    response = som_ms.doSomAndPlot(som_model, embedded_words, dict_index2word,
                                   file_name, type_chart)

    if (type_chart == "d3"):
        return render_template('MST_d3.html')
    elif (type_chart == "json"):
        return jsonify(response)
    else:
        return internalServerError(500)
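
Per the schema above, a request might look like this; the route comes from the log line (/analytics-backend/doSomAndPlot), while the host, port and model ids are assumptions.

# Hypothetical client call requesting the node-link graph as JSON (host/port assumed).
import requests

body = {
    "w2v_model_id": "1234",
    "som_model_id": "5678",
    "type_chart": "json",
    "tweets": [{"message": "the new release fixed the login issue"}]
}
resp = requests.post("http://localhost:5000/analytics-backend/doSomAndPlot", json=body)
print resp.json()  # directed/multigraph flags plus nodes and links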
Code example #9
File: app.py Project: OpenReqEU/analytics-backend
def trainWord2Vec():
    """
        Train word 2 vec model
        Takes a list of tweet messages (or the URL of a CSV with tweet messages), returns the id of the model to be trained, and starts the training in a new thread (the training process takes hours)
        ---
        parameters:
          - in: body
            name: body
            schema:
              type: object
              properties:
                url_input:
                  type: string
                  description: url of csv with tweet messages
                tweets:
                  type: array
                  items:
                    type: object
                    properties:
                      message:
                        type: string
                        description: tweet message
            required: true
        responses:
          200:
            description: Id trained Model
            schema:
                type: object
                properties:
                  w2v_model_id:
                    type: string
          500:
            description: Internal Server Error
            schema:
                type: object
                properties:
                    error:
                     type: string
    """
    log.info("/analytics-backend/trainWord2Vec")
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)

        # DEBUGGING
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
            df = df.head(100)

        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    cleaned_input_list = clean_text_ms.cleanText(input_list)
    corpus = Corpus.createCorpus(cleaned_input_list)

    identifier = core.utility.utilities.getUniqueIdentifier()
    thread.start_new_thread(word2vec_ms.trainNewModelW2Vmodel,
                            (corpus, identifier))

    response = jsonify({"w2v_model_id": identifier})
    return response
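
A matching client sketch for trainWord2Vec; the host and port are assumptions, and the returned w2v_model_id is the identifier the other endpoints expect once training finishes.

# Hypothetical client call; training continues in a background thread (host/port assumed).
import requests

body = {"tweets": [{"message": "please add dark mode"},
                   {"message": "dark mode would be great"}]}
resp = requests.post("http://localhost:5000/analytics-backend/trainWord2Vec", json=body)
print resp.json()  # e.g. {"w2v_model_id": "<identifier>"}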