Example #1
import os

import networkx as nx

# graphUtils, graphConstants, graphSeedNodes and the helpers used below
# (get_init_R, normalize_edge_Weights, personalizedPageRank, writeNewR)
# are project-local modules/functions.


def PPR():
    todayDate = graphUtils.getTodayDateFolder()
    lastRecommendationDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPH_RECOMM_DONE)
    #lastRecommendationDate = None
    if todayDate == lastRecommendationDate:
        graphUtils.logger.info(
            "Simple Graph recommendation PPR done for today")
        return
    graphUtils.logger.info("Simple graph recommendation PPR last done for =" +
                           str(lastRecommendationDate))
    # Load the current version of the stored main graph
    graph_path = os.path.join(graphConstants.ROOT_FOLDER,
                              graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR,
                              graphConstants.TYPE_MAIN)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    G = nx.read_gexf(graph_file)

    # Map each node name to a matrix index and build the initial rank vector
    list_nodes = {x: i for i, x in enumerate(G.nodes())}
    R = get_init_R(G, list_nodes)

    #Normalize edge transition weights
    M = normalize_edge_Weights(list_nodes, G)

    S, list_seednode_names = graphSeedNodes.findSeedNodes(G, list_nodes)
    for idx, node in enumerate(list_seednode_names):
        graphUtils.logger.info(
            str(idx) + " seed node for simple graph today = " + node)
    # Run personalized PageRank from the seed nodes and persist the result
    newR = personalizedPageRank(R, M, S)
    printGraphRecommendedDocs(G, list_nodes, newR)
    writeNewR(G, list_nodes, newR, graph_file)
    graphUtils.saveSettings(graphConstants.LAST_GRAPH_RECOMM_DONE, todayDate)
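
The helpers get_init_R, normalize_edge_Weights, and personalizedPageRank are defined elsewhere in this project and are not shown in the listing. As a rough guide to what the last one likely computes, here is a minimal power-iteration sketch; the damping factor alpha, the tolerance, and the assumption that M is a row-stochastic transition matrix are assumptions of this sketch, not taken from the original code.

import numpy as np

def personalizedPageRank(R, M, S, alpha=0.85, tol=1e-8, max_iter=100):
    # R: current rank vector, M: row-stochastic transition matrix (numpy),
    # S: seed (teleport) distribution; all length-n / n-by-n arrays.
    R = np.asarray(R, dtype=float)
    S = np.asarray(S, dtype=float)
    for _ in range(max_iter):
        # Follow an out-edge with probability alpha, otherwise jump to a seed.
        newR = alpha * M.T.dot(R) + (1 - alpha) * S
        if np.abs(newR - R).sum() < tol:  # L1 convergence check
            return newR
        R = newR
    return R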
Example #2
import json
import operator
import os

# graphUtils, graphConstants, Constants and readLinksJson are project-local.


def printGraphRecommendedDocs(G, list_nodes, R):
    todayDateFolder = graphUtils.getTodayDateFolder()
    jsonData = readLinksJson(todayDateFolder)
    if jsonData is None:
        return False

    result = False
    jsonData['GoogleNews'][Constants.NERGRAPH] = []
    recommInfo = {}
    graphDocs = {}

    googleLinks = jsonData['GoogleNews'][Constants.GOOGLE]
    for linkObj in googleLinks:
        download = linkObj['download']
        htmlFile = graphConstants.TYPE_GOOGLE + "_" + linkObj[
            'id'] + "_" + todayDateFolder
        if download == "yes" and htmlFile in list_nodes:
            recommInfo[htmlFile] = linkObj
            htmlFile_idx = list_nodes[htmlFile]
            graphDocs[htmlFile] = R[htmlFile_idx]
    try:
        sorted_x = sorted(graphDocs.items(), key=operator.itemgetter(1))
        sorted_x.reverse()
        write_directory = os.path.join(graphConstants.ROOT_FOLDER,
                                       graphConstants.FINAL_DIR,
                                       todayDateFolder)
        if not os.path.exists(write_directory):
            os.makedirs(write_directory)
        outpath = os.path.join(write_directory, graphConstants.ULTIMATE_FILE)

        count = 1
        for (key, val) in sorted_x:
            if key in recommInfo:
                linkObj = recommInfo[key]
                linkObj['rank'] = -1
                jsonData['GoogleNews'][Constants.NERGRAPH].append(linkObj)
                count = count + 1
                if count >= graphConstants.RECOMMENDED_LINKS:
                    break
            else:
                graphUtils.logger.error(
                    "NER Graph normalGoogle key not found = " + key)
        # A context manager guarantees the file is closed even if dump fails
        with open(outpath, 'w') as outfile:
            json.dump(jsonData, outfile)
        result = True
    except Exception as e:
        graphUtils.logger.error("Exception = %s" % e)
        graphUtils.logger.error(
            "Exception at writing final Graph Recommendation docs for data : %s"
            % write_directory)
    return result
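
A small aside on the ranking step above: sorting the score dict and then calling reverse() can be collapsed into one call with reverse=True, which reads more directly. A sketch with hypothetical scores:

import operator

graphDocs = {"doc_a": 0.42, "doc_b": 0.91, "doc_c": 0.17}  # hypothetical scores
# Equivalent to sorted(...) followed by .reverse(): sort by score, descending.
sorted_x = sorted(graphDocs.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_x)  # [('doc_b', 0.91), ('doc_a', 0.42), ('doc_c', 0.17)]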
Example #3
import os

import networkx as nx
from gensim import corpora, models, similarities

# graphUtils, graphConstants and readFromFile are project-local.


def Relevance():
    todayDate = graphUtils.getTodayDateFolder()
    lastRelevanceDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPH_RELEVANCE_DIR)
    lastSuggRelevanceDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPH_SUGG_RELEVANCE_DIR)

    if lastRelevanceDate:
        graphUtils.logger.info("Graph Relevance done last for =" +
                               lastRelevanceDate)
    else:
        graphUtils.logger.info("Graph Relevance done last for None")

    if lastSuggRelevanceDate:
        graphUtils.logger.info("GraphSugg Relevance done last for =" +
                               lastSuggRelevanceDate)
    else:
        graphUtils.logger.info("GraphSugg Relevance done last for None")

    if todayDate == lastRelevanceDate and todayDate == lastSuggRelevanceDate:
        graphUtils.logger.info(
            "Graph Relevance signal already done for today :" + todayDate)
        return True
    graph_path = os.path.join(graphConstants.ROOT_FOLDER,
                              graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR,
                              graphConstants.TYPE_MAIN)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    write_graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    if not os.path.exists(graph_path):
        os.makedirs(graph_path)
    G = nx.read_gexf(graph_file)
    trainFiles, trainFileNames = graphUtils.findRecommTrainGraphFiles()
    trainCorpus = graphUtils.findCorpus(trainFiles)
    # Drop words that occur only once in the whole training corpus
    all_tokens = sum(trainCorpus, [])
    tokens_once = set(word for word in set(all_tokens)
                      if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in trainCorpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus=corpus,
                              id2word=dictionary,
                              normalize=True)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary))

    if todayDate != lastRelevanceDate:
        testFiles, testFileName = graphUtils.findRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        mini = 100
        maxi = -1
        count = 0
        for idx, text in enumerate(testCorpus):
            #Add this recommendation node
            recomm_nodename = testFileName[idx]
            if recomm_nodename not in G.nodes():
                G.add_node(recomm_nodename)
                G.node[recomm_nodename]['type'] = graphConstants.TYPE_GOOGLE
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            for idxsim, prob in enumerate(sims):
                if prob < 0.1:
                    continue
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    G.add_edge(recomm_nodename, trainNode, weight=prob)
                    G.add_edge(trainNode, recomm_nodename, weight=prob)
                    count = count + 1
            text = readFromFile(testFiles[idx])
            #NERFunc(text,G, recomm_nodename)
        graphUtils.logger.info(
            "Simple graph relevance completed for today. Stats follow")
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Relevance count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPH_RELEVANCE_DIR,
                                todayDate)

    if todayDate != lastSuggRelevanceDate:
        testFiles, testFileName = graphUtils.findSuggRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        mini = 100
        maxi = -1
        count = 0
        for idx, text in enumerate(testCorpus):
            #Add this recommendation node
            recomm_nodename = testFileName[idx]
            if recomm_nodename not in G.nodes():
                G.add_node(recomm_nodename)
                G.node[recomm_nodename]['type'] = graphConstants.TYPE_SUGG
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            for idxsim, prob in enumerate(sims):
                if prob < 0.1:
                    continue
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    G.add_edge(recomm_nodename, trainNode, weight=prob)
                    G.add_edge(trainNode, recomm_nodename, weight=prob)
                    count = count + 1
            text = readFromFile(testFiles[idx])
            #NERFunc(text,G, recomm_nodename)
        graphUtils.logger.info(
            "Simple graph relevance completed for suggestGoogle today. Stats follow"
        )
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Relevance count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPH_SUGG_RELEVANCE_DIR,
                                todayDate)
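
Relevance() leans on gensim's dictionary / TF-IDF / similarity-index pipeline. To make that pipeline easier to follow in isolation, here is a self-contained sketch with toy tokenized documents; the token lists are invented for illustration (in the real code they come from graphUtils.findCorpus):

from gensim import corpora, models, similarities

# Toy tokenized corpora standing in for the train/test documents.
train_texts = [["graph", "random", "walk"], ["news", "article", "graph"]]
test_text = ["graph", "news"]

dictionary = corpora.Dictionary(train_texts)
corpus = [dictionary.doc2bow(t) for t in train_texts]
tfidf = models.TfidfModel(corpus=corpus, id2word=dictionary, normalize=True)
index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                            num_features=len(dictionary))

# Cosine similarity of the test document against every training document;
# Relevance() keeps scores >= 0.1 and uses them as symmetric edge weights.
sims = index[tfidf[dictionary.doc2bow(test_text)]]
for idxsim, prob in enumerate(sims):
    print(idxsim, float(prob))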
Example #4
import operator
import os

import networkx as nx
import numpy as np
from numpy import zeros

# graphUtils, graphConstants and the Bm25 class are project-local.


def Smoothness():
    todayDate = graphUtils.getTodayDateFolder()
    lastSmoothnessDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPHNER_SMOOTHNESS_DIR)
    lastSuggSmoothnessDate = graphUtils.loadSettings(
        graphConstants.LAST_GRAPHNER_SUGG_SMOOTHNESS_DIR)

    if lastSmoothnessDate:
        graphUtils.logger.info("NERGraph Smoothness done last for =" +
                               lastSmoothnessDate)
    else:
        graphUtils.logger.info("NERGraph Smoothness done last for None")

    if lastSuggSmoothnessDate:
        graphUtils.logger.info("NERGraphSugg Smoothness done last for =" +
                               lastSuggSmoothnessDate)
    else:
        graphUtils.logger.info("NERGraphSugg Smoothness done last for None")

    if todayDate == lastSmoothnessDate and todayDate == lastSuggSmoothnessDate:
        graphUtils.logger.info(
            "NERGraph Smoothness signal already done for today :" + todayDate)
        return True
    graph_path = os.path.join(graphConstants.ROOT_FOLDER,
                              graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR,
                              graphConstants.TYPE_NER)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    write_graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    if not os.path.exists(graph_path):
        os.makedirs(graph_path)
    G = nx.read_gexf(graph_file)
    trainFiles, trainFileNames = graphUtils.findRecommTrainGraphNerFiles()
    trainCorpus = graphUtils.findCorpus(trainFiles)
    bm25obj = Bm25(trainCorpus)
    trainUniqueWords = []
    for trainText in trainCorpus:
        trainUniqueWords.append(set(trainText))

    if todayDate != lastSmoothnessDate:
        testFiles, testFileName = graphUtils.findRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        testUniqueWords = []
        mini = 100
        maxi = -1
        count = 0
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testText in testCorpus:
            testUniqueWords.append(set(testText))
        for testDoc in range(len(testCorpus)):
            recomm_nodename = testFileName[testDoc]
            uniqueTest = testUniqueWords[testDoc]
            SminusDcontext = zeros(bm25obj.N)
            DminusScontext = zeros(bm25obj.N)
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                # Score the words unique to each side (train-minus-test and
                # test-minus-train) against the whole corpus with BM25, then
                # take the dot product of the two score vectors as smoothness.
                SminusD = [
                    word for word in trainCorpus[trainDoc]
                    if word not in uniqueTest
                ]
                DminusS = [
                    word for word in testCorpus[testDoc]
                    if word not in uniqueTrain
                ]
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext,
                                                       DminusScontext)
            dict_arr = {
                key: value
                for (key, value) in enumerate(smoothness[testDoc])
            }
            sorted_x = sorted(dict_arr.items(), key=operator.itemgetter(1))
            sorted_x.reverse()
            sorted_x = sorted_x[:graphConstants.MAX_SMOOTHNESS_EDGE]
            total = sum([pair[1] for pair in sorted_x])
            for (idxsim, val) in sorted_x:
                prob = val / total
                if recomm_nodename not in G.nodes():
                    G.add_node(recomm_nodename)
                    G.node[recomm_nodename][
                        'type'] = graphConstants.TYPE_GOOGLE
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    if not G.has_edge(recomm_nodename, trainNode):
                        G.add_edge(recomm_nodename,
                                   trainNode,
                                   weight=prob *
                                   graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[recomm_nodename][trainNode]['weight'] += (
                            prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)

                    if not G.has_edge(trainNode, recomm_nodename):
                        G.add_edge(trainNode,
                                   recomm_nodename,
                                   weight=prob *
                                   graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[trainNode][recomm_nodename]['weight'] += (
                            prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    count = count + 1

            #print smoothness[testDoc]
        graphUtils.logger.info(
            " ner graph Smoothness completed for normalGoogle today. Stats follow"
        )
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Smoothness edges count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPHNER_SMOOTHNESS_DIR,
                                todayDate)

    if todayDate != lastSuggSmoothnessDate:
        testFiles, testFileName = graphUtils.findSuggRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        testUniqueWords = []
        mini = 100
        maxi = -1
        count = 0
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testText in testCorpus:
            testUniqueWords.append(set(testText))
        for testDoc in range(len(testCorpus)):
            recomm_nodename = testFileName[testDoc]
            uniqueTest = testUniqueWords[testDoc]
            SminusDcontext = zeros(bm25obj.N)
            DminusScontext = zeros(bm25obj.N)
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                SminusD = [
                    word for word in trainCorpus[trainDoc]
                    if word not in uniqueTest
                ]
                DminusS = [
                    word for word in testCorpus[testDoc]
                    if word not in uniqueTrain
                ]
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext,
                                                       DminusScontext)
            dict_arr = {
                key: value
                for (key, value) in enumerate(smoothness[testDoc])
            }
            sorted_x = sorted(dict_arr.items(), key=operator.itemgetter(1))
            sorted_x.reverse()
            sorted_x = sorted_x[:graphConstants.MAX_SMOOTHNESS_EDGE]
            total = sum([pair[1] for pair in sorted_x])
            for (idxsim, val) in sorted_x:
                prob = val / total
                if recomm_nodename not in G.nodes():
                    G.add_node(recomm_nodename)
                    G.node[recomm_nodename]['type'] = graphConstants.TYPE_SUGG
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    if not G.has_edge(recomm_nodename, trainNode):
                        G.add_edge(recomm_nodename,
                                   trainNode,
                                   weight=prob *
                                   graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[recomm_nodename][trainNode]['weight'] += (
                            prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)

                    if not G.has_edge(trainNode, recomm_nodename):
                        G.add_edge(trainNode,
                                   recomm_nodename,
                                   weight=prob *
                                   graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[trainNode][recomm_nodename]['weight'] += (
                            prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    count = count + 1

            #print smoothness[testDoc]
        graphUtils.logger.info(
            " ner graph Smoothness completed for suggestGoogle today. Stats follow"
        )
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Smoothness edges count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(
            graphConstants.LAST_GRAPHNER_SUGG_SMOOTHNESS_DIR, todayDate)
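
Bm25 is a project-local class whose implementation is not part of this listing; from the usage above, BM25Score(tokens) evidently returns one score per training document (a vector of length bm25obj.N). Below is a minimal Okapi BM25 sketch under that assumed interface, with standard k1 and b defaults; the internals are a guess, not the original class.

import math
import numpy as np

class Bm25(object):
    """Minimal Okapi BM25 over a tokenized corpus (assumed interface)."""

    def __init__(self, corpus, k1=1.5, b=0.75):
        self.corpus = corpus
        self.N = len(corpus)          # matches the bm25obj.N used above
        self.k1, self.b = k1, b
        self.doc_len = [len(doc) for doc in corpus]
        self.avgdl = float(sum(self.doc_len)) / max(self.N, 1)
        self.df = {}                  # document frequency of each term
        for doc in corpus:
            for word in set(doc):
                self.df[word] = self.df.get(word, 0) + 1

    def BM25Score(self, query):
        # One BM25 score per corpus document for the given token list.
        scores = np.zeros(self.N)
        for i, doc in enumerate(self.corpus):
            tf = {}
            for word in doc:
                tf[word] = tf.get(word, 0) + 1
            for word in query:
                if word not in tf:
                    continue
                idf = math.log((self.N - self.df[word] + 0.5) /
                               (self.df[word] + 0.5) + 1.0)
                denom = tf[word] + self.k1 * (
                    1 - self.b + self.b * self.doc_len[i] / self.avgdl)
                scores[i] += idf * tf[word] * (self.k1 + 1) / denom
        return scores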