コード例 #1
0
def analyzeBlogs(blogList):
    """Analyze a list of blogs with TF-IDF and term-density counts.

    For every blog the ten highest-scoring TF-IDF words are printed and
    appended to the returned word list. Across all blogs the function also
    counts how many punctuation-free words belong to the name, religion,
    weaponry and government term lists.

    Args:
        blogList: Iterable of blog objects exposing a ``words`` attribute
            (each element is passed to ``tfidf.tfidf`` together with the
            whole list).

    Returns:
        An ``AnalysisObject`` built from the names, religion, weaponry and
        government densities (category count divided by the total number of
        words seen in all blogs) and the list of ``(word, score)`` tuples of
        the top words of every blog. All densities are 0.0 when no words
        were seen at all.
    """
    outputWordsArr = []
    namesCount = religionCount = weaponryCount = governmentCount = 0
    # Word total over *all* blogs: the category counts accumulate across the
    # whole list, so the denominator has to cover the whole list as well.
    totalWordCount = 0

    # The term lists do not depend on the word being inspected, so fetch
    # them once instead of once per word.
    punctuation = terms.punctuation()
    governmentTerms = terms.governmentTerms()
    weaponsTerms = terms.weaponsTerms()
    femaleNames = terms.femaleNames()
    maleNames = terms.maleNames()
    religiousTerms = terms.religiousTerms()

    for i, blog in enumerate(blogList):
        scores = {}
        print("Top words in document {}".format(i + 1))
        for word in blog.words:
            word = word.lower()  # Everything is compared in lowercase.
            totalWordCount += 1  # Every word counts towards the total.
            if any(punc in word for punc in punctuation):
                continue  # Words containing punctuation are not scored.
            if word not in scores:
                # Score each distinct word only once per blog.
                scores[word] = tfidf.tfidf(word, blog, blogList)
            # Increment counts based on content to find word densities.
            if word in governmentTerms:
                governmentCount += 1
            if word in weaponsTerms:
                weaponryCount += 1
            if word in femaleNames or word in maleNames:
                namesCount += 1
            if word in religiousTerms:
                religionCount += 1

        # Sort once per blog, after every word has been scored. A blog with
        # no scorable words simply contributes no top words.
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:10]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
            outputWordsArr.append((word, round(score, 10)))
        print("---------------------------------------------------------")

    # Gather the density score of each feature; guard against an empty corpus.
    if totalWordCount:
        namesDensity = namesCount / totalWordCount
        religionDensity = religionCount / totalWordCount
        weaponryDensity = weaponryCount / totalWordCount
        governmentDensity = governmentCount / totalWordCount
    else:
        namesDensity = religionDensity = 0.0
        weaponryDensity = governmentDensity = 0.0

    return AnalysisObject(
        namesDensity,
        religionDensity,
        weaponryDensity,
        governmentDensity,
        outputWordsArr,
    )
コード例 #2
0
def analyzeNewBlog(blog, goodBlogList, badBlogList, features):
    """Score a new blog against previously analyzed "good" and "bad" blogs.

    The blog's name, religion, weaponry and government term densities are
    computed and compared with the stored features of each category. Every
    non-stop-word feature word found in the blog text raises that category's
    score by 100 times the word's stored score; every density difference
    lowers it. If the two scores differ by at least 0.5, the blog is wrapped
    with ``tb`` and appended to the list of the winning category.

    Args:
        blog: Text of the new blog; it is passed to ``tb`` and also searched
            directly with ``in``.
        goodBlogList: List that receives ``tb(blog)`` when the blog is
            marked as 'good'.
        badBlogList: List that receives ``tb(blog)`` when the blog is
            flagged as 'bad'.
        features: Mapping keyed by "good"/"bad". Each value maps "words" to
            an iterable of ``(word, score)`` pairs and "religion",
            "government", "weaponry", "names" to density numbers.

    Returns:
        None. Results are printed and the blog lists are updated in place.
    """
    blob = tb(blog)

    # The term lists do not depend on the word being inspected, so fetch
    # them once instead of once per word.
    governmentTerms = terms.governmentTerms()
    weaponsTerms = terms.weaponsTerms()
    femaleNames = terms.femaleNames()
    maleNames = terms.maleNames()
    religiousTerms = terms.religiousTerms()
    stopWords = terms.stopWords()

    # Get word densities of the new blog. Words are read from ``.words`` and
    # lowercased, exactly as analyzeBlogs does for the blogs it analyzes, so
    # both sets of densities are measured the same way.
    # NOTE(review): assumes tb(blog) exposes ``.words`` like the objects in
    # the blog lists handed to analyzeBlogs -- confirm.
    namesCount = religionCount = weaponryCount = governmentCount = 0
    wordCount = 0
    for word in blob.words:
        word = word.lower()
        wordCount += 1
        if word in governmentTerms:
            governmentCount += 1
        if word in weaponsTerms:
            weaponryCount += 1
        if word in femaleNames or word in maleNames:
            namesCount += 1
        if word in religiousTerms:
            religionCount += 1

    if wordCount:
        analysisOutputs = AnalysisObject(
            namesCount / wordCount,
            religionCount / wordCount,
            weaponryCount / wordCount,
            governmentCount / wordCount,
            None,
        )
    else:
        # A blog without words has no densities; avoid dividing by zero.
        analysisOutputs = AnalysisObject(0.0, 0.0, 0.0, 0.0, None)

    # Density features: stored key -> (label used in output, new blog value).
    densityFeatures = {
        "religion": ("Religion", analysisOutputs.religionScore),
        "government": ("Government", analysisOutputs.governmentScore),
        "weaponry": ("Weaponry", analysisOutputs.weaponryScore),
        "names": ("Names", analysisOutputs.namesScore),
    }

    # Compare to the analyzed ones.
    scores = {"good": 0.0, "bad": 0.0}
    for upperKey in features:
        print ("\nComparing this blog to " + upperKey.upper() + " blogs:\n")
        for lowerKey in features[upperKey]:
            stored = features[upperKey][lowerKey]
            if lowerKey == "words":
                for word in stored:
                    # The lookup in ``blog`` is done on the raw text.
                    if word[0] not in stopWords and word[0] in blog:
                        print ("Word found in " + upperKey + " blog: " + word[0])
                        # If a word is found, update the score relative to
                        # its TF-IDF score.
                        scores[upperKey] += word[1] * 100
            elif lowerKey in densityFeatures:
                # Penalize the category by how far the new blog's density is
                # from the density of the analyzed blogs.
                label, newDensity = densityFeatures[lowerKey]
                variance = abs(stored - newDensity)
                scores[upperKey] -= variance
                print (label + " variance: " + str(variance))

    print ("\nFinal Scores:\n" + "Bad: " + str(scores["bad"]) + "\nGood: " + str(scores["good"]) + "\n")
    if abs(scores["good"] - scores["bad"]) < .5:
        print ("This post does not trend towards 'good' or 'bad'.")
    elif scores["good"] > scores["bad"]:
        print ("This post has been marked as 'good'.")
        # Add the blog to the list so that a later analysis can include it.
        goodBlogList.append(blob)
    else:
        print ("This post has been flagged as 'bad'.")
        badBlogList.append(blob)
    print ("\n---------------------------------------")