def analyzeBlogs(blogList):
    """Analyze a corpus of blogs with TF-IDF and term-density statistics.

    For every blog the ten highest-scoring TF-IDF words are printed and
    collected. Across the whole corpus, words are counted per term category
    (names, religion, weaponry, government) and the counts are turned into
    densities by dividing by the total number of words seen in all blogs.

    Args:
        blogList: Iterable of blog objects exposing a ``words`` iterable.
            Each blog, together with ``blogList`` itself, is also passed to
            ``tfidf.tfidf``.

    Returns:
        An ``AnalysisObject`` constructed from the names, religion, weaponry
        and government densities (in that order) followed by the list of
        ``(word, score)`` tuples holding each blog's top ten words. All
        densities are 0.0 when the corpus contains no words.
    """
    # Fetch the term collections once instead of once per word.
    # NOTE(review): assumes each terms.* call returns the same re-iterable
    # collection every time it is called - confirm against the terms module.
    punctuation = terms.punctuation()
    governmentTerms = terms.governmentTerms()
    weaponsTerms = terms.weaponsTerms()
    femaleNames = terms.femaleNames()
    maleNames = terms.maleNames()
    religiousTerms = terms.religiousTerms()

    outputWordsArr = []
    namesCount = religionCount = weaponryCount = governmentCount = 0
    # Corpus-wide total. The category counts above accumulate over every
    # blog, so the denominator must do the same (it used to be reset for
    # each blog, leaving only the last blog's word count).
    wordCount = 0

    for i, blog in enumerate(blogList):
        scores = {}
        print("Top words in document {}".format(i + 1))
        for word in blog.words:
            word = word.lower()  # Everything is compared in lowercase.
            wordCount += 1  # Every word counts towards the total.
            if any(punc in word for punc in punctuation):
                # Tokens containing punctuation are neither scored nor
                # categorised.
                continue

            scores[word] = tfidf.tfidf(word, blog, blogList)

            # Increment counts based on content to find word densities.
            if word in governmentTerms:
                governmentCount += 1
            if word in weaponsTerms:
                weaponryCount += 1
            if word in femaleNames or word in maleNames:
                namesCount += 1
            if word in religiousTerms:
                religionCount += 1

        # Highest TF-IDF first; keep the top ten of this blog.
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:10]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
            outputWordsArr.append((word, round(score, 10)))
        print("---------------------------------------------------------")

    # Convert counts to densities; an empty corpus yields zeros rather than
    # a ZeroDivisionError.
    if wordCount:
        namesDensity = namesCount / wordCount
        religionDensity = religionCount / wordCount
        weaponryDensity = weaponryCount / wordCount
        governmentDensity = governmentCount / wordCount
    else:
        namesDensity = religionDensity = 0.0
        weaponryDensity = governmentDensity = 0.0

    return AnalysisObject(
        namesDensity,
        religionDensity,
        weaponryDensity,
        governmentDensity,
        outputWordsArr,
    )
def analyzeNewBlog(blog, goodBlogList, badBlogList, features):
    """Score a new blog against the analysed feature sets and file it.

    The blog's term densities (names, religion, weaponry, government) are
    computed, then compared with each feature set in ``features``. A feature
    set's score rises when one of its non-stop-word feature words occurs in
    the blog and falls by the absolute difference between its densities and
    the blog's. Progress and the verdict are printed. When the two final
    scores differ by at least 0.5, the wrapped blog is appended to the
    winning list.

    Args:
        blog: The new blog's text. It is wrapped with ``tb`` for
            tokenising and is also substring-tested directly.
        goodBlogList: List that receives ``tb(blog)`` when the blog is
            marked as good.
        badBlogList: List that receives ``tb(blog)`` when the blog is
            flagged as bad.
        features: Mapping expected to be keyed by ``"good"`` and ``"bad"``.
            Each value maps ``"words"`` to an iterable of
            ``(word, tfidf_score)`` pairs and ``"religion"``,
            ``"government"``, ``"weaponry"`` and ``"names"`` to densities.
            Other inner keys are ignored.

    Returns:
        None. Results are reported through ``print`` and by mutating
        ``goodBlogList`` or ``badBlogList``.
    """
    # Fetch the term collections once instead of once per word.
    # NOTE(review): assumes each terms.* call returns the same re-iterable
    # collection every time it is called - confirm against the terms module.
    governmentTerms = terms.governmentTerms()
    weaponsTerms = terms.weaponsTerms()
    femaleNames = terms.femaleNames()
    maleNames = terms.maleNames()
    religiousTerms = terms.religiousTerms()
    stopWords = terms.stopWords()

    blob = tb(blog)

    # Get word densities of the new blog. Iterate ``.words`` and lowercase,
    # exactly as analyzeBlogs does for the reference blogs, so both sets of
    # densities are measured the same way.
    # NOTE(review): if tb is TextBlob, iterating the blob itself walks
    # characters rather than words - confirm.
    namesCount = religionCount = weaponryCount = governmentCount = 0
    wordCount = 0
    for word in blob.words:
        word = word.lower()
        wordCount += 1
        if word in governmentTerms:
            governmentCount += 1
        if word in weaponsTerms:
            weaponryCount += 1
        if word in femaleNames or word in maleNames:
            namesCount += 1
        if word in religiousTerms:
            religionCount += 1

    # An empty blog yields zero densities rather than a ZeroDivisionError.
    if wordCount:
        namesDensity = namesCount / wordCount
        religionDensity = religionCount / wordCount
        weaponryDensity = weaponryCount / wordCount
        governmentDensity = governmentCount / wordCount
    else:
        namesDensity = religionDensity = 0.0
        weaponryDensity = governmentDensity = 0.0
    analysisOutputs = AnalysisObject(
        namesDensity, religionDensity, weaponryDensity, governmentDensity, None
    )

    # Feature key -> (label used in the printed message, this blog's density).
    densityFeatures = {
        "religion": ("Religion", analysisOutputs.religionScore),
        "government": ("Government", analysisOutputs.governmentScore),
        "weaponry": ("Weaponry", analysisOutputs.weaponryScore),
        "names": ("Names", analysisOutputs.namesScore),
    }

    # Compare to the analyzed ones.
    scores = {"good": 0.0, "bad": 0.0}
    for upperKey in features:
        print("\nComparing this blog to " + upperKey.upper() + " blogs:\n")
        for lowerKey in features[upperKey]:
            featureValue = features[upperKey][lowerKey]
            if lowerKey == "words":
                for word in featureValue:
                    # ``in blog`` tests against the raw blog text.
                    if word[0] in stopWords or word[0] not in blog:
                        continue
                    print("Word found in " + upperKey + " blog: " + word[0])
                    # If a word is found, update the score relative to its
                    # TF-IDF score.
                    scores[upperKey] += word[1] * 100
            elif lowerKey in densityFeatures:
                # Penalise by how far the new blog's density is from the
                # density of that term category in the analysed blogs.
                label, blogDensity = densityFeatures[lowerKey]
                variance = abs(featureValue - blogDensity)
                scores[upperKey] -= variance
                print(label + " variance: " + str(variance))

    print("\nFinal Scores:\n" + "Bad: " + str(scores["bad"]) + "\nGood: " + str(scores["good"]) + "\n")

    if abs(scores["good"] - scores["bad"]) < .5:
        print("This post does not trend towards 'good' or 'bad'.")
    elif scores["good"] > scores["bad"]:
        print("This post has been marked as 'good'.")
        # Add the blog to the list so that a later analysis run over the
        # list would include it.
        goodBlogList.append(blob)
    else:
        print("This post has been flagged as 'bad'.")
        badBlogList.append(blob)
    print("\n---------------------------------------")