def getCodebookActivation():
    """ Plot result of codebook Activation
    Get som model id and return a png of codebook activations
    ---
    consumes:
      - application/json
    produces:
      - application/json
    parameters:
      - in: query
        name: som_model_id
        type: string
        required: true
        description: id of SOM model
    responses:
      200:
        description: png of codebook activations
        schema:
          type: object
          properties:
            picture:
              type: string
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
      299:
        description: Model is still training or not trained
        schema:
          type: object
          properties:
            warning:
              type: string
    """
    log.info("/analytics-backend/getCodebookActivation")
    som_model_id = request.args.get('som_model_id')
    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except Exception:
        # Pickle missing or unreadable: the model is presumably still
        # training — report its status from the "_training.txt" marker file.
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    # som_ms returns the path of the rendered PNG; send it back
    # base64-encoded inside a JSON payload.
    png = som_ms.getCodebookActivation(som_model)
    import base64
    with open(png, "rb") as image_file:
        encoded = base64.b64encode(image_file.read())
    return json.dumps({'picture': encoded})
def trainCodebookClustering():
    """ Train model Codebook Clustering
    Get the id of SOM model, return the id of the model that will be trained
    and start in a new thread the training of the model
    (The training process could take hours)
    ---
    parameters:
      - in: body
        name: body
        schema:
          type: object
          properties:
            som_model_id:
              type: string
              description: id of model SOM
        required: true
    responses:
      200:
        description: Id trained model
        schema:
          type: object
          properties:
            codebook_cluster_model_id:
              type: string
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
      299:
        description: Model is still training or not trained
        schema:
          type: object
          properties:
            warning:
              type: string
    """
    log.info("/analytics-backend/trainCodebookClustering")
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)
    som_model_id = data_json["som_model_id"]

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except Exception:
        # SOM pickle not loadable: report training status instead.
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    # Fire-and-forget training thread; the caller polls with the returned id.
    identifier = core.utility.utilities.getUniqueIdentifier()
    thread.start_new_thread(som_ms.trainNewModelCodebookCluster,
                            (som_model, identifier))
    response = jsonify({"codebook_cluster_model_id": identifier})
    return response
def getCostOfSom():
    """ Get cost of Som
    Get som model id and return cost of model
    ---
    parameters:
      - in: query
        name: som_model_id
        type: string
        required: true
        description: id of SOM model
    responses:
      200:
        description: cost of som
        schema:
          type: object
          properties:
            cost of model:
              type: string
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
      299:
        description: Model is still training or not trained
        schema:
          type: object
          properties:
            warning:
              type: string
    """
    log.info("/analytics-backend/getCostOfSom")
    som_model_id = request.args.get('som_model_id')
    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except Exception:
        # SOM pickle not loadable: report training status instead.
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    cost = som_ms.getCostOfSom(som_model)
    response = jsonify({"cost of model": cost})
    return response
def keywordsExtraction():
    """ Keywords Extraction
    Get a the id of the bigram model, the list of tweet messages or the url
    of the csv with tweet messages and return the lists of keywords
    ---
    parameters:
      - in: body
        name: body
        schema:
          type: object
          properties:
            url_input:
              type: string
              description: url of csv with tweet messages
            bigram_model_id:
              type: string
              description: id of bigram model
            tweets:
              type: array
              items:
                type: object
                properties:
                  message:
                    type: string
                    description: tweet message
        required: true
    responses:
      200:
        description: lists of keywords
        schema:
          type: array
          items:
            type: array
            items:
              type: string
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
      299:
        description: Model is still training or not trained
        schema:
          type: object
          properties:
            warning:
              type: string
    """
    log.info("/analytics-backend/keywordsExtraction")
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    # Messages come either from a remote CSV ('url_input') or inline in the
    # request body ('tweets').
    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)
        # DEBUGGING: override with a small local sample file when enabled.
        debugging = conf.get('MAIN', 'debugging')
        if debugging == 'True':
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file, encoding='utf-8',
                             error_bad_lines=False)
            df = df.head()
        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    bigram_model_id = data_json["bigram_model_id"]
    filename = conf.get('MAIN', 'path_pickle_bigram_model_incr_fold'
                        ) + "bigram_" + str(bigram_model_id) + ".pickle"
    try:
        bigram_model = som_ms.load_obj(filename)
    except Exception:
        # Bigram pickle not loadable: report training status instead.
        filename = conf.get(
            'MAIN', 'path_pickle_bigram_model_incr_fold') + "bigram_" + str(
            bigram_model_id) + "_training.txt"
        return returnModelStatus(filename, bigram_model_id)

    keywds = text_ranking_ms.extractKeywords(input_list, bigram_model)
    response = jsonify(keywds)
    return response
def getCodebookWords():
    """ Get all words associated to codebooks
    Get a the id of word2vec model, the id of SOM model and the list of tweet
    messages and return the lists of associated words
    ---
    parameters:
      - in: body
        name: body
        schema:
          type: object
          properties:
            w2v_model_id:
              type: string
              description: id of model to use for word embedding
            som_model_id:
              type: string
              description: id of SOM model
            tweets:
              type: array
              items:
                type: object
                properties:
                  message:
                    type: string
                    description: tweet message
        required: true
    responses:
      200:
        description: list of codebook words
        schema:
          type: object
          properties:
            0:
              type: array
              items:
                type: string
                description: codebook word
            1:
              type: array
              items:
                type: string
                description: codebook word
            ...:
              type: array
              items:
                type: string
                description: codebook word
            n:
              type: array
              items:
                type: string
                description: codebook word
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
      299:
        description: Model is still training or not trained
        schema:
          type: object
          properties:
            warning:
              type: string
    """
    log.info("/analytics-backend/getCodebookWords")

    # reading json input
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)
    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]
    input_list = json.dumps(data_json["tweets"])
    input_list = pd.read_json(input_list, encoding='utf8')['message'].tolist()

    # Load the word2vec model, falling back to the training-status marker.
    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        w2v_model = word2vec_ms.Word2Vec.load(filename)
    except Exception:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
            w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    # Load the SOM model, falling back to the training-status marker.
    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except Exception:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    codebook2words = som_ms.getCodebookWords(input_list, w2v_model, som_model)
    response = jsonify(codebook2words)
    return response
def getCellFrequencyDistribution():
    """ Plot frequency distribution
    Get word2vec model id, som model id, the list of tweet messages or the
    url of the csv with messages, the type of result ("bubble" or "bar") and
    return a result graph
    ---
    consumes:
      - application/json
    produces:
      - application/json
    parameters:
      - in: body
        name: body
        schema:
          type: object
          properties:
            w2v_model_id:
              type: string
              description: id of model to use for word embedding
            som_model_id:
              type: string
              description: id of SOM model
            num:
              type: string
              description: number of variables sorted from biggest
            type_chart:
              type: string
              description: type of result "bubble" or "bar"
            url_input:
              type: string
              description: url of the csv with messages
            tweets:
              type: array
              items:
                type: object
                properties:
                  message:
                    type: string
                    description: tweet message
        required: true
    responses:
      200:
        description: graph of frequencies
        schema:
          type: object
          properties:
            html:
              type: string
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
      299:
        description: Model is still training or not trained
        schema:
          type: object
          properties:
            warning:
              type: string
    """
    log.info("/analytics-backend/getCellFrequencyDistribution")
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)
    type_chart = data_json["type_chart"]
    num = data_json["num"]
    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]

    # Messages come either from a remote CSV ('url_input') or inline in the
    # request body ('tweets').
    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)
        # DEBUGGING: override with a small local sample file when enabled.
        debugging = conf.get('MAIN', 'debugging')
        if debugging == 'True':
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file, encoding='utf-8',
                             error_bad_lines=False)
            df = df.head()
        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    # Load the word2vec model, falling back to the training-status marker.
    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        w2v_model = word2vec_ms.Word2Vec.load(filename)
    except Exception:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
            w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    # Load the SOM model, falling back to the training-status marker.
    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except Exception:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    html = som_ms.getCellFrequencyDistribution(input_list, w2v_model,
                                               som_model, num, type_chart)
    return json.dumps({'html': html})
def computeTopics():
    """ Extracts topics from a list of tweets
    Get word2vec model id, som model id, the codebook cluster model id, the
    list of tweet messages or the url of the csv with messages and return a
    result graph
    ---
    parameters:
      - in: body
        name: body
        schema:
          type: object
          properties:
            w2v_model_id:
              type: string
              description: id of model to use for word embedding
            som_model_id:
              type: string
              description: id of SOM model
            codebook_cluster_model_id:
              type: string
              description: id of codebook cluster model
            tweets:
              type: array
              items:
                type: object
                properties:
                  message:
                    type: string
                    description: tweet message
        required: true
    responses:
      200:
        description: graphs of topics
        schema:
          type: array
          items:
            type: object
            properties:
              directed:
                type: boolean
              graph:
                type: object
              links:
                type: array
                items:
                  type: object
                  properties:
                    source:
                      type: integer
                    target:
                      type: integer
              multigraph:
                type: boolean
              nodes:
                type: array
                items:
                  type: object
                  properties:
                    id:
                      type: integer
                    name:
                      type: string
                    pos:
                      type: array
                      items:
                        type: integer
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
      299:
        description: Model is still training or not trained
        schema:
          type: object
          properties:
            warning:
              type: string
    """
    log.info("/analytics-backend/computeTopics")

    # remove old html topics left over from previous runs
    import glob
    import os
    for fl in glob.glob("./templates/dried_*.html"):
        os.remove(fl)

    # reading json input
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)
    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]
    cluster_model_id = data_json["codebook_cluster_model_id"]
    input_list = json.dumps(data_json["tweets"])
    input_list = pd.read_json(input_list, encoding='utf8')['message'].tolist()

    # load models, each falling back to its training-status marker file
    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        w2v_model = word2vec_ms.Word2Vec.load(filename)
    except Exception:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
            w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except Exception:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    filename = conf.get(
        'MAIN', 'path_pickle_codebook_cluster_model_incr_fold'
    ) + "codebook_cluster_" + str(cluster_model_id) + ".pickle"
    try:
        cluster_model = som_ms.load_obj(filename)
    except Exception:
        # BUG FIX: the original computed the "_training.txt" path but never
        # assigned it to `filename`, so returnModelStatus was called with the
        # .pickle path instead of the training-status marker.
        filename = conf.get(
            'MAIN', 'path_pickle_codebook_cluster_model_incr_fold'
        ) + "codebook_cluster_" + str(cluster_model_id) + "_training.txt"
        return returnModelStatus(filename, cluster_model_id)

    dried_topics = Topics.doSomAndDryTopics(input_list, w2v_model, som_model,
                                            cluster_model)
    graphs = Topics.predictTopics(input_list, w2v_model, som_model,
                                  cluster_model, dried_topics,
                                  type_chart="json")
    response = jsonify(graphs)
    return response
def doSomAndPlot1():
    """ Get entities: Apply SOM and plot result of codebook MST
    Get word2vec model id, som model id, the list of tweet messages or the
    url of the csv with messages, the type of result and return a result
    graph
    ---
    parameters:
      - in: body
        name: body
        schema:
          type: object
          properties:
            w2v_model_id:
              type: string
              description: id of model to use for word embedding
            som_model_id:
              type: string
              description: id of SOM model
            type_chart:
              type: string
              description: type of result "d3" (html) of json
            url_input:
              type: string
              description: url of the csv with messages
            tweets:
              type: array
              items:
                type: object
                properties:
                  message:
                    type: string
                    description: tweet message
        required: true
    responses:
      200:
        description: graph of entities
        schema:
          type: object
          properties:
            directed:
              type: boolean
            graph:
              type: object
            links:
              type: array
              items:
                type: object
                properties:
                  source:
                    type: integer
                  target:
                    type: integer
            multigraph:
              type: boolean
            nodes:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: integer
                  name:
                    type: string
                  pos:
                    type: array
                    items:
                      type: integer
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
      299:
        description: Model is still training or not trained
        schema:
          type: object
          properties:
            warning:
              type: string
    """
    log.info("/analytics-backend/doSomAndPlot")

    # reading json input
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)
    type_chart = data_json["type_chart"]
    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]

    # Messages come either from a remote CSV ('url_input') or inline in the
    # request body ('tweets').
    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)
        # DEBUGGING: override with a small local sample file when enabled.
        debugging = conf.get('MAIN', 'debugging')
        if debugging == 'True':
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file, encoding='utf-8',
                             error_bad_lines=False)
            df = df.head()
        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    # Load the word2vec model, falling back to the training-status marker.
    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        model = word2vec_ms.Word2Vec.load(filename)
    except Exception:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
            w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    # get embedded words from input
    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = \
        word2vec_ms.getEmbeddedWords(cleaned_tweet_list, model)

    # Load the SOM model, falling back to the training-status marker.
    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except Exception:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    response = som_ms.doSomAndPlot(som_model, embedded_words,
                                   dict_index2word, file_name, type_chart)
    if type_chart == "d3":
        # BUG FIX: the original had an unreachable `return html` after this
        # return, referencing an undefined name; it has been removed.
        return render_template('MST_d3.html')
    elif type_chart == "json":
        return jsonify(response)
    else:
        return internalServerError(500)
def textRanking():
    """ Text Ranking
    Get a the id of the bigram and w2v models, the list of tweet messages or
    the url of the csv with tweet messages and return the text ranking
    ---
    parameters:
      - in: body
        name: body
        schema:
          type: object
          properties:
            url_input:
              type: string
              description: url of csv with tweet messages
            bigram_model_id:
              type: string
              description: id of bigram model
            w2v_model_id:
              type: string
              description: id of word2vec model
            tweets:
              type: array
              items:
                type: object
                properties:
                  message:
                    type: string
                    description: tweet message
        required: true
    responses:
      200:
        description: text ranking
        schema:
          type: array
          items:
            type: array
            items:
              type: string
      500:
        description: Internal Server Error
        schema:
          type: object
          properties:
            error:
              type: string
    """
    log.info("/analytics-backend/textRanking")
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    # Messages come either from a remote CSV ('url_input') or inline in the
    # request body ('tweets').
    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)
        # DEBUGGING: override with a small local sample file when enabled.
        debugging = conf.get('MAIN', 'debugging')
        if debugging == 'True':
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file, encoding='utf-8',
                             error_bad_lines=False)
            df = df.head(100)
        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    # Load the word2vec model, falling back to the training-status marker.
    w2v_model_id = data_json["w2v_model_id"]
    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        w2v_model = Word2Vec.load(filename)
    except Exception:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
            w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    # Load the bigram model, falling back to the training-status marker.
    bigram_model_id = data_json["bigram_model_id"]
    filename = conf.get('MAIN', 'path_pickle_bigram_model_incr_fold'
                        ) + "bigram_" + str(bigram_model_id) + ".pickle"
    try:
        bigram_model = som_ms.load_obj(filename)
    except Exception:
        filename = conf.get(
            'MAIN', 'path_pickle_bigram_model_incr_fold') + "bigram_" + str(
            bigram_model_id) + "_training.txt"
        return returnModelStatus(filename, bigram_model_id)

    # Use the module logger instead of bare prints, consistent with the
    # rest of the handlers in this file.
    log.info("input list len %d", len(input_list))
    keywds = text_ranking_ms.extractKeywords(input_list, bigram_model)
    log.info("keywds list len %d", len(keywds))
    topics = TopicLabeler.textRanking(keywds, w2v_model)
    response = jsonify(topics)
    return response