Code example #1
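# Excerpt from the SuPrIA project; assumed context: `import os` and
# `import networkx as nx`, with graphUtils, graphConstants and graphSeedNodes
# imported from the project, and get_init_R, normalize_edge_Weights,
# personalizedPageRank, printGraphRecommendedDocs and writeNewR defined
# elsewhere in the same file.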
def PPR():
    todayDate = graphUtils.getTodayDateFolder()
    lastRecommendationDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPH_RECOMM_DONE)
    if todayDate == lastRecommendationDate:
        graphUtils.logger.info(
            "Simple Graph recommendation PPR done for today ")
        return
    graphUtils.logger.info("Simple graph recommendation PPR last done for =" +
                           str(lastRecommendationnDate))
    #Get the current version of stored graphs
    graph_path = os.path.join(graphConstants.ROOT_FOLDER,
                              graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR,
                              graphConstants.TYPE_MAIN)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    G = nx.read_gexf(graph_file)

    list_nodes = {x: i for i, x in enumerate(G.nodes())}
    R = get_init_R(G, list_nodes)

    #Normalize edge transition weights
    M = normalize_edge_Weights(list_nodes, G)

    S, list_seednode_names = graphSeedNodes.findSeedNodes(G, list_nodes)
    for idx, node in enumerate(list_seednode_names):
        graphUtils.logger.info(
            str(idx) + " seed node for simple graph today = " + node)
    newR = personalizedPageRank(R, M, S)
    printGraphRecommendedDocs(G, list_nodes, newR)
    writeNewR(G, list_nodes, newR, graph_file)
    graphUtils.saveSettings(graphConstants.LAST_GRAPH_RECOMM_DONE, todayDate)
Code example #2
File: graphRelevance.py Project: weberna/SuPrIA
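# Assumed context: `import os`, `import networkx as nx` and
# `from gensim import corpora, models, similarities`; graphUtils,
# graphConstants and readFromFile come from the SuPrIA project.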
def Relevance():
    todayDate = graphUtils.getTodayDateFolder()
    lastRelevanceDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPH_RELEVANCE_DIR)
    lastSuggRelevanceDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPH_SUGG_RELEVANCE_DIR)

    if lastRelevanceDate:
        graphUtils.logger.info("Graph Relevance done last for =" +
                               lastRelevanceDate)
    else:
        graphUtils.logger.info("Graph Relevance done last for None")

    if lastSuggRelevanceDate:
        graphUtils.logger.info("GraphSugg Relevance done last for =" +
                               lastSuggRelevanceDate)
    else:
        graphUtils.logger.info("GraphSugg Relevance done last for None")

    if todayDate == lastRelevanceDate and todayDate == lastSuggRelevanceDate:
        graphUtils.logger.info(
            "Graph Relevance signal already done for today :" + todayDate)
        return True
    graph_path = os.path.join(graphConstants.ROOT_FOLDER,
                              graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR,
                              graphConstants.TYPE_MAIN)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    write_graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    if not os.path.exists(graph_path):
        os.makedirs(graph_path)
    G = nx.read_gexf(graph_file)
    trainFiles, trainFileNames = graphUtils.findRecommTrainGraphFiles()
    trainCorpus = graphUtils.findCorpus(trainFiles)
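    #Drop words that appear only once across the training corpus
    #(note: count() on the flattened list makes this quadratic)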
    all_tokens = sum(trainCorpus, [])
    tokens_once = set(word for word in set(all_tokens)
                      if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in trainCorpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus=corpus,
                              id2word=dictionary,
                              normalize=True)
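    #Sparse tf-idf similarity index: scores each new document against all training docs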
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary))

    if todayDate != lastRelevanceDate:
        testFiles, testFileName = graphUtils.findRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        mini = 100
        maxi = -1
        count = 0
        for idx, text in enumerate(testCorpus):
            #Add this recommendation node
            recomm_nodename = testFileName[idx]
            if recomm_nodename not in G.nodes():
                G.add_node(recomm_nodename)
                G.node[recomm_nodename]['type'] = graphConstants.TYPE_GOOGLE
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            for idxsim, prob in enumerate(sims):
                if prob < 0.1:
                    continue
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    G.add_edge(recomm_nodename, trainNode, weight=prob)
                    G.add_edge(trainNode, recomm_nodename, weight=prob)
                    count += 1
            text = readFromFile(testFiles[idx])
            #NERFunc(text,G, recomm_nodename)
        graphUtils.logger.info(
            "Simple graph relevance completed for today. Stats follow")
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Relevance count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPH_RELEVANCE_DIR,
                                todayDate)

    if todayDate != lastSuggRelevanceDate:
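        #Same relevance computation, repeated for suggested documents (TYPE_SUGG)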
        testFiles, testFileName = graphUtils.findSuggRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        mini = 100
        maxi = -1
        count = 0
        for idx, text in enumerate(testCorpus):
            #Add this recommendation node
            recomm_nodename = testFileName[idx]
            if recomm_nodename not in G.nodes():
                G.add_node(recomm_nodename)
                G.node[recomm_nodename]['type'] = graphConstants.TYPE_SUGG
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            for idxsim, prob in enumerate(sims):
                if prob < 0.1:
                    continue
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    G.add_edge(recomm_nodename, trainNode, weight=prob)
                    G.add_edge(trainNode, recomm_nodename, weight=prob)
                    count += 1
            text = readFromFile(testFiles[idx])
            #NERFunc(text,G, recomm_nodename)
        graphUtils.logger.info(
            "Simple graph relevance completed for suggestGoogle today. Stats follow"
        )
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Relevance count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPH_SUGG_RELEVANCE_DIR,
                                todayDate)
Code example #3
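# Assumed context: `import os`, `import networkx as nx`,
# `from gensim import models` and `from time import time`; MyCorpus,
# LDA_PASSES, graphUtils and graphConstants come from the SuPrIA project.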
def buildGraph():
    #Load up the current graph we may have

    #Last Graph date done
    date_LAST_GRAPH_DONE = graphUtils.loadSettings(
        graphConstants.date_LAST_GRAPH_DONE)
    date_LAST_TEXTCORPUS_DONE = graphUtils.loadSettings(
        graphConstants.date_LAST_TEXTCORPUS_DONE)
    date_LAST_LDA_DONE = graphUtils.loadSettings(
        graphConstants.date_LAST_LDA_DONE)
    yesterdayFolder = graphUtils.getYesterdayDateFolder()
    if date_LAST_GRAPH_DONE == yesterdayFolder:
        graphUtils.logger.info("simple Graph already built till yesterday")
        return
    graphFiles, graphFileNames = graphUtils.findGraphFiles()
    graphCorpus = graphUtils.findCorpus(graphFiles)
    all_tokens = sum(graphCorpus, [])
    tokens_once = set(word for word in set(all_tokens)
                      if all_tokens.count(word) == 1)
    new_texts = [[word for word in text if word not in tokens_once]
                 for text in graphCorpus]

    #Retrieve text corpus
    txtcorpus_path = os.path.join(graphConstants.ROOT_FOLDER,
                                  graphConstants.GRAPH_DIR,
                                  graphConstants.TEXTCORPUS_DIR,
                                  graphConstants.TYPE_MAIN)
    txtcorpus_file = os.path.join(txtcorpus_path, graphConstants.TEXTCORPUS_FILE)
    if not os.path.exists(txtcorpus_path):
        os.makedirs(txtcorpus_path)
    objtxt_corpus = None
    if date_LAST_TEXTCORPUS_DONE is None:
        objtxt_corpus = MyCorpus(new_texts)
        objtxt_corpus.save(txtcorpus_file)
        graphUtils.saveSettings(graphConstants.date_LAST_TEXTCORPUS_DONE, yesterdayFolder)
    elif date_LAST_TEXTCORPUS_DONE != yesterdayFolder:
        objtxt_corpus = MyCorpus.load(txtcorpus_file)
        objtxt_corpus.update_corpus(new_texts)
        objtxt_corpus.save(txtcorpus_file)
        graphUtils.saveSettings(graphConstants.date_LAST_TEXTCORPUS_DONE, yesterdayFolder)
    else:
        objtxt_corpus = MyCorpus.load(txtcorpus_file)
    
    txt_dictionary = objtxt_corpus.dictionary
    corpus = [txt_dictionary.doc2bow(text) for text in objtxt_corpus.corpus]
    newtxt_corpus = [txt_dictionary.doc2bow(text) for text in new_texts]
    
    tfidf = models.TfidfModel(corpus=corpus,
                              id2word=txt_dictionary,
                              normalize=True)
    idf = models.tfidfmodel.precompute_idfs(tfidf.wglobal, txt_dictionary.dfs,
                                            len(corpus))
    
    if date_LAST_LDA_DONE is not None:
        graphUtils.logger.info("Simple graph nodes lda after =" +
                               date_LAST_LDA_DONE + " starts")
    else:
        graphUtils.logger.info("Simple graph nodes lda after = none starts")
    t0 = time()
    #Do lda: load or update the stored LDA model
    lda_path = os.path.join(graphConstants.ROOT_FOLDER,
                            graphConstants.GRAPH_DIR,
                            graphConstants.LDA_DIR,
                            graphConstants.TYPE_MAIN)
    lda_file = os.path.join(lda_path, graphConstants.LDA_FILE)
    lda = None
    if not os.path.exists(lda_path):
        os.makedirs(lda_path)
    if date_LAST_LDA_DONE is None:
        lda = models.LdaModel(corpus=corpus, id2word=txt_dictionary,
                              num_topics=50, update_every=1, chunksize=10000,
                              passes=LDA_PASSES)
        lda.save(lda_file)
        graphUtils.saveSettings(graphConstants.date_LAST_LDA_DONE, yesterdayFolder)
    elif date_LAST_LDA_DONE != yesterdayFolder:
        lda = models.LdaModel.load(lda_file)
        lda.update(newtxt_corpus)
        lda.save(lda_file)
        graphUtils.saveSettings(graphConstants.date_LAST_LDA_DONE, yesterdayFolder)
    else:
        lda = models.LdaModel.load(lda_file)
    
    t1 = time()
    graphUtils.logger.info("Simple graph nodes lda time =" + str(t1 - t0) +
                           " seconds ends")
    #Develop graph
    G = None
    graph_path = os.path.join(graphConstants.ROOT_FOLDER,
                              graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR,
                              graphConstants.TYPE_MAIN)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    if not os.path.exists(graph_path):
        os.makedirs(graph_path)
    if date_LAST_GRAPH_DONE is None:
        G = nx.DiGraph()
    elif date_LAST_GRAPH_DONE != yesterdayFolder:
        G = nx.read_gexf(graph_file)
    
    if date_LAST_GRAPH_DONE is not None:
        graphUtils.logger.info("Simple graph nodes addition after =" +
                               date_LAST_GRAPH_DONE + " starts")
    else:
        graphUtils.logger.info("Simple graph nodes addition after = None")
    #Add nodes and edges for the newly collected corpus
    if date_LAST_GRAPH_DONE != yesterdayFolder:
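        #Single-element loop left over from topic-count experiments;
        #only num_topics=50 is actually used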
        for topic in [50]:
            for index,document in enumerate(newtxt_corpus):
                node_name = graphFileNames[index]
                G.add_node(node_name)
                G.node[node_name]['type'] = graphConstants.TYPE_HISTORY
                topics = lda[document]
                #print "Document start"
                for topicObj,topicProb in topics:
                    #Compare topicProb with some threshold value
                    if topicProb > 0.1:
                        topicid = topicObj
                        words = lda.show_topic(topicid, topn=10)
                        for wordProb, word in words:
                            wordId = txt_dictionary.doc2bow([word])[0][0]
                            idfWord = idf[wordId]
                            if idfWord > 3.0:
                                word = word.lower()
                                #If this topic word doesn't exist as a node, add it
                                if word not in G.nodes():
                                    G.add_node(word)
                                    G.node[word]['type'] = graphConstants.TYPE_TOPICS
                                #Create or reinforce the doc<->word edges
                                if not G.has_edge(node_name, word):
                                    G.add_edge(node_name, word, weight=1)
                                else:
                                    G[node_name][word]["weight"] += 1
                                if not G.has_edge(word, node_name):
                                    G.add_edge(word, node_name, weight=1)
                                else:
                                    G[word][node_name]["weight"] += 1
                                graphUtils.logger.info("word = " + word +
                                                       " document =" + node_name)
                        
                        
        #G = NER.NERFunc(graphFiles, graphFileNames, G)
        nx.write_gexf(G, graph_file)
        graphUtils.saveSettings(graphConstants.date_LAST_GRAPH_DONE, yesterdayFolder)
        graphUtils.logger.info("Simple graph nodes addition after ="+str(date_LAST_GRAPH_DONE)+" ends")
        
Code example #4
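# Assumed context: `import os`, `import operator`, `import numpy as np`,
# `from numpy import zeros` and `import networkx as nx`; Bm25, graphUtils
# and graphConstants come from the SuPrIA project.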
def Smoothness():
    todayDate = graphUtils.getTodayDateFolder()
    lastSmoothnessDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPHNER_SMOOTHNESS_DIR)
    lastSuggSmoothnessDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPHNER_SUGG_SMOOTHNESS_DIR)

    if lastSmoothnessDate:
        graphUtils.logger.info("NERGraph Smoothness done last for =" +
                               lastSmoothnessDate)
    else:
        graphUtils.logger.info("NERGraph Smoothness done last for None")

    if lastSuggSmoothnessDate:
        graphUtils.logger.info("NERGraphSugg Smoothness done last for =" +
                               lastSuggSmoothnessDate)
    else:
        graphUtils.logger.info("NERGraphSugg Smoothness done last for None")

    if todayDate == lastSmoothnessDate and todayDate == lastSuggSmoothnessDate:
        graphUtils.logger.info(
            "NERGraph Smoothness signal already done for today :" + todayDate)
        return True
    graph_path = os.path.join(graphConstants.ROOT_FOLDER,
                              graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR,
                              graphConstants.TYPE_NER)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    write_graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    if not os.path.exists(graph_path):
        os.makedirs(graph_path)
    G = nx.read_gexf(graph_file)
    trainFiles, trainFileNames = graphUtils.findRecommTrainGraphNerFiles()
    trainCorpus = graphUtils.findCorpus(trainFiles)
    bm25obj = Bm25(trainCorpus)
    trainUniqueWords = []
    for trainText in trainCorpus:
        trainUniqueWords.append(set(trainText))

    if todayDate != lastSmoothnessDate:
        testFiles, testFileName = graphUtils.findRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        testUniqueWords = []
        mini = 100
        maxi = -1
        count = 0
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testText in testCorpus:
            testUniqueWords.append(set(testText))
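        #smoothness[i][j]: dot product of BM25 context scores for the words
        #unique to each side of (test doc i, train doc j)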
        for testDoc in range(len(testCorpus)):
            recomm_nodename = testFileName[testDoc]
            uniqueTest = testUniqueWords[testDoc]
            SminusDcontext = zeros(bm25obj.N)
            DminusScontext = zeros(bm25obj.N)
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                SminusD = [
                    word for word in trainCorpus[trainDoc]
                    if word not in uniqueTest
                ]
                DminusS = [
                    word for word in testCorpus[testDoc]
                    if word not in uniqueTrain
                ]
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext,
                                                       DminusScontext)
            dict_arr = {
                key: value
                for (key, value) in enumerate(smoothness[testDoc])
            }
            sorted_x = sorted(dict_arr.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
            sorted_x = sorted_x[:graphConstants.MAX_SMOOTHNESS_EDGE]
            total = sum([pair[1] for pair in sorted_x])
            for (idxsim, val) in sorted_x:
                prob = val / total
                if recomm_nodename not in G.nodes():
                    G.add_node(recomm_nodename)
                    G.node[recomm_nodename]['type'] = graphConstants.TYPE_GOOGLE
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    if not G.has_edge(recomm_nodename, trainNode):
                        G.add_edge(recomm_nodename, trainNode,
                                   weight=prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[recomm_nodename][trainNode]['weight'] += (
                            prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)

                    if not G.has_edge(trainNode, recomm_nodename):
                        G.add_edge(trainNode, recomm_nodename,
                                   weight=prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[trainNode][recomm_nodename]['weight'] += (
                            prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    count += 1

        graphUtils.logger.info(
            " ner graph Smoothness completed for normalGoogle today. Stats follow"
        )
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Smoothness edges count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPHNER_SMOOTHNESS_DIR,
                                todayDate)

    if todayDate != lastSuggSmoothnessDate:
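        #Same smoothness computation, repeated for suggested documents (TYPE_SUGG)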
        testFiles, testFileName = graphUtils.findSuggRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        testUniqueWords = []
        mini = 100
        maxi = -1
        count = 0
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testText in testCorpus:
            testUniqueWords.append(set(testText))
        for testDoc in range(len(testCorpus)):
            recomm_nodename = testFileName[testDoc]
            uniqueTest = testUniqueWords[testDoc]
            SminusDcontext = zeros(bm25obj.N)
            DminusScontext = zeros(bm25obj.N)
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                SminusD = [
                    word for word in trainCorpus[trainDoc]
                    if word not in uniqueTest
                ]
                DminusS = [
                    word for word in testCorpus[testDoc]
                    if word not in uniqueTrain
                ]
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext,
                                                       DminusScontext)
            dict_arr = {
                key: value
                for (key, value) in enumerate(smoothness[testDoc])
            }
            sorted_x = sorted(dict_arr.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
            sorted_x = sorted_x[:graphConstants.MAX_SMOOTHNESS_EDGE]
            total = sum([pair[1] for pair in sorted_x])
            for (idxsim, val) in sorted_x:
                prob = val / total
                if recomm_nodename not in G.nodes():
                    G.add_node(recomm_nodename)
                    G.node[recomm_nodename]['type'] = graphConstants.TYPE_SUGG
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    if not G.has_edge(recomm_nodename, trainNode):
                        G.add_edge(recomm_nodename, trainNode,
                                   weight=prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[recomm_nodename][trainNode]['weight'] += (
                            prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)

                    if not G.has_edge(trainNode, recomm_nodename):
                        G.add_edge(trainNode, recomm_nodename,
                                   weight=prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[trainNode][recomm_nodename]['weight'] += (
                            prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    count += 1

        graphUtils.logger.info(
            " ner graph Smoothness completed for suggestGoogle today. Stats follow"
        )
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Smoothness edges count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(
            graphConstants.LAST_GRAPHNER_SUGG_SMOOTHNESS_DIR, todayDate)
Code example #5
File: graphClarity.py Project: weberna/SuPrIA
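# Assumed context: `import os`, `import operator` and `import networkx as nx`;
# Clarity, graphUtils and graphConstants come from the SuPrIA project.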
def ConnectionClarity():
    todayDate = graphUtils.getYesterdayDateFolder()
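    #NOTE: unlike Relevance/Smoothness this keys off getYesterdayDateFolder(),
    #even though the variable is still named todayDate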
    lastClarityDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPH_CLARITY_DIR)
    lastSuggClarityDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPH_SUGG_CLARITY_DIR)
    if lastClarityDate:
        graphUtils.logger.info("Graph Google Clarity done last for =" +
                               lastClarityDate)
    else:
        graphUtils.logger.info("Graph Google Clarity done last for none")

    if lastSuggClarityDate:
        graphUtils.logger.info("Graph Sugg Clarity done last for =" +
                               lastSuggClarityDate)
    else:
        graphUtils.logger.info("Graph Sugg Clarity done last for none")

    if todayDate == lastClarityDate and todayDate == lastSuggClarityDate:
        graphUtils.logger.info("graph Clarity signal done for today =" +
                               todayDate)
        return True

    graph_path = os.path.join(graphConstants.ROOT_FOLDER,
                              graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR,
                              graphConstants.TYPE_MAIN)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    write_graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    if not os.path.exists(graph_path):
        os.makedirs(graph_path)
    G = nx.read_gexf(graph_file)
    trainFiles, trainFileNames = graphUtils.findRecommTrainGraphFiles()

    trainCorpus = graphUtils.findCorpus(trainFiles)
    if todayDate != lastClarityDate:
        testFiles, testFileName = graphUtils.findRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        clarityobj = Clarity(trainCorpus, testCorpus)
        clarityScore = clarityobj.ClarityScore()
        mini = 100
        maxi = -1
        count = 0
        for testidx, text in enumerate(testCorpus):
            recomm_nodename = testFileName[testidx]
            dict_arr = {
                key: value
                for (key, value) in enumerate(clarityScore[testidx])
            }
            sorted_x = sorted(dict_arr.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
            sorted_x = sorted_x[:graphConstants.MAX_CLARITY_EDGE]
            total = sum([pair[1] for pair in sorted_x])
            for (idxsim, val) in sorted_x:
                prob = val / total
                if prob < 0.0:
                    break
                if recomm_nodename not in G.nodes():
                    G.add_node(recomm_nodename)
                    G.node[recomm_nodename]['type'] = graphConstants.TYPE_GOOGLE
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    if not G.has_edge(recomm_nodename, trainNode):
                        G.add_edge(recomm_nodename, trainNode,
                                   weight=prob * graphConstants.CLARITY_EDGE_WEIGHT)
                    else:
                        G[recomm_nodename][trainNode]['weight'] += (
                            prob * graphConstants.CLARITY_EDGE_WEIGHT)

                    if not G.has_edge(trainNode, recomm_nodename):
                        G.add_edge(trainNode, recomm_nodename,
                                   weight=prob * graphConstants.CLARITY_EDGE_WEIGHT)
                    else:
                        G[trainNode][recomm_nodename]['weight'] += (
                            prob * graphConstants.CLARITY_EDGE_WEIGHT)

                    count += 1

        graphUtils.logger.info(
            "Simple  graph clarity completed for googlenews today. Stats follow"
        )
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("clarity edges count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPH_CLARITY_DIR,
                                todayDate)

    if todayDate != lastSuggClarityDate:
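        #Same clarity computation, repeated for suggested documents (TYPE_SUGG)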
        testFiles, testFileName = graphUtils.findSuggRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        clarityobj = Clarity(trainCorpus, testCorpus)
        clarityScore = clarityobj.ClarityScore()
        mini = 100
        maxi = -1
        count = 0
        for testidx, text in enumerate(testCorpus):
            recomm_nodename = testFileName[testidx]
            dict_arr = {
                key: value
                for (key, value) in enumerate(clarityScore[testidx])
            }
            sorted_x = sorted(dict_arr.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
            sorted_x = sorted_x[:graphConstants.MAX_CLARITY_EDGE]
            total = sum([pair[1] for pair in sorted_x])
            for (idxsim, val) in sorted_x:
                prob = val / total
                if prob < 0.0:
                    break
                if recomm_nodename not in G.nodes():
                    G.add_node(recomm_nodename)
                    G.node[recomm_nodename]['type'] = graphConstants.TYPE_SUGG
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    if not G.has_edge(recomm_nodename, trainNode):
                        G.add_edge(recomm_nodename, trainNode,
                                   weight=prob * graphConstants.CLARITY_EDGE_WEIGHT)
                    else:
                        G[recomm_nodename][trainNode]['weight'] += (
                            prob * graphConstants.CLARITY_EDGE_WEIGHT)

                    if not G.has_edge(trainNode, recomm_nodename):
                        G.add_edge(trainNode, recomm_nodename,
                                   weight=prob * graphConstants.CLARITY_EDGE_WEIGHT)
                    else:
                        G[trainNode][recomm_nodename]['weight'] += (
                            prob * graphConstants.CLARITY_EDGE_WEIGHT)

                    count += 1

        graphUtils.logger.info(
            "Simple  graph clarity completed for SuggestGoogle today. Stats follow"
        )
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("clarity edges count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPH_SUGG_CLARITY_DIR,
                                todayDate)