コード例 #1
0
	def getAffectRatio(self, filename, text):
		"""Store the ratio of affect-pattern matches to words in *text*.

		Every entry of ``liwc()['Affect']`` is treated as a regular expression;
		a trailing word boundary is appended and its matches in *text* are
		counted. The total is divided by ``len(getWords(text))`` and written to
		``poemFeatures["affectRatio"]``. Nothing is returned.

		``filename`` is accepted but not used by this method.
		"""
		# this could be provided as a feature, too
		self.wordsAffect = liwc()['Affect']
		count = sum(
			len(re.findall(regex + r"\b", text)) for regex in self.wordsAffect
		)
		wordCount = len(getWords(text))
		# Text that yields no words used to raise ZeroDivisionError here;
		# report a ratio of 0.0 instead so one empty input cannot abort a run.
		# NOTE(review): poemFeatures is not defined in this method; confirm it
		# is a module-level dict rather than a missing ``self.`` prefix.
		poemFeatures["affectRatio"] = float(count) / wordCount if wordCount else 0.0
コード例 #2
0
from nltk import corpus
from math import log
from random import sample

# Comment-count threshold; its use is outside this section. Presumably the
# minimum number of comments a file needs to be considered (TODO confirm).
MIN_COMMENT_NUM = 10
# Directory listed by getCommentFilenames(). The trailing slash matters because
# paths are built there by plain string concatenation.
COMMENT_DIR = "../data/comments_old/"
# File names for the stored ratio dictionaries; the ".p" suffix suggests pickle
# files, but the code that reads/writes them is not in this section (TODO confirm).
AFFECT_RATIO_DICT = "affect_ratio.p"
AFFECT_RATIO_PER_COMMENT_DICT = "affect_ratio2.p"
NRC_RATIO_DICT = "nrc_ratio.p"
# Path to the NRC lexicon text file.
NRC_FILE = '../data/NRC-lexicon.txt'
# Presumably identifiers of comment files to exclude (verify against the code
# that consumes this list); the inline notes give the reason for each group.
IGNORE_FILES = ["039", # someone added wikipedia articles as comments
		"411", "447","466" # lots of loves
	]

# English stopword list from the NLTK corpus; removeStopwords() builds its
# regex from this list.
stopwordList = corpus.stopwords.words('english')
# The 'Affect' entry of whatever liwc() returns; liwc is defined outside this
# section. Evaluated once at import time.
affectWordList = liwc()['Affect']

def makeRegexFromList(l):
	"""Build one alternation regex that matches any entry of *l* as a whole word.

	Each entry is wrapped in ``\\b`` word boundaries and the entries are joined
	with ``|``. Entries are NOT escaped, so they may themselves be regex
	fragments. Every "." in the combined pattern is then replaced by ``[a-z]``,
	so a wildcard such as ``lov.*`` only spans lowercase letters.

	An empty list yields the pattern ``\\b\\b``.

	Returns the pattern as a string (not compiled).
	"""
	alternation = r"\b" + r"\b|\b".join(l) + r"\b"
	# A literal "." replacement needs no regex; str.replace also avoids the
	# invalid "\." escape sequence the non-raw pattern literal used to contain.
	return alternation.replace(".", "[a-z]")

def removeStopwords(text):
	"""Return *text* with every match of the stopword pattern deleted.

	The pattern is rebuilt from ``stopwordList`` via makeRegexFromList() on
	each call. Matches are replaced by the empty string, so whitespace around
	a removed word is left in place.
	"""
	pattern = re.compile(makeRegexFromList(stopwordList))
	return pattern.sub("", text)

def getWords(text):
	"""Return the word tokens of *text* after stopword removal.

	*text* may be a string or an iterable of strings; the pieces are
	concatenated with no separator and surrounding whitespace is stripped
	before removeStopwords() is applied. A token is a maximal run of word
	characters and apostrophes.
	"""
	joined = ''.join(text).strip()
	# Raw string: "\w" inside a plain literal is an invalid escape sequence
	# and triggers a warning on modern Python versions.
	return re.findall(r"[\w']+", removeStopwords(joined))

def getCommentFilenames():
	"""List the regular files directly inside COMMENT_DIR.

	Returns a list of ``(name, path)`` tuples, where ``path`` is COMMENT_DIR
	concatenated with the entry name. Entries that are not files are skipped.
	"""
	pairs = []
	for name in listdir(COMMENT_DIR):
		path = COMMENT_DIR + name
		if isfile(path):
			pairs.append((name, path))
	return pairs