Example #1
	def tokenize(self, sentence):
		# Normalize the sentence first so cache lookups use a consistent key.
		sentence = self.preprocess(sentence)
		if sentence not in Feature.tokens_cache:
			tokenizer = Tokenizer()
			# Alternative with per-token spell correction:
			# Feature.tokens_cache[sentence] = [Feature.checker.correct(t) for t in tokenizer.tokenize(sentence)]
			Feature.tokens_cache[sentence] = tokenizer.tokenize(sentence)
		return Feature.tokens_cache[sentence]
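A minimal, runnable sketch of the memoization pattern above. The real `Feature` base class, `preprocess`, and `Tokenizer` live elsewhere in this project, so the stub bodies here (lower-casing and whitespace splitting) are assumptions for illustration only:

class Tokenizer:
	# Stand-in for the project's tokenizer: whitespace split only.
	def tokenize(self, sentence):
		return sentence.split()

class Feature:
	tokens_cache = {}  # class-level, so repeated sentences tokenize only once

	def preprocess(self, sentence):
		return sentence.lower().strip()

	def tokenize(self, sentence):
		sentence = self.preprocess(sentence)
		if sentence not in Feature.tokens_cache:
			Feature.tokens_cache[sentence] = Tokenizer().tokenize(sentence)
		return Feature.tokens_cache[sentence]

f = Feature()
print(f.tokenize("Great movie!"))  # tokenizes and caches
print(f.tokenize("great movie!"))  # same key after preprocessing: cache hit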
Example #2
	def __init__(self, contextsize=100):
		self._contextsize = contextsize
		# Booleans rather than strings: the string "False" is truthy, so
		# passing strings here would likely return the positive list twice.
		self._pwords = get_opinion_words(positive=True)
		self._nwords = get_opinion_words(positive=False)
		self._soscores = {}
		self._tokenizer = Tokenizer()
		self._initialized = False
from collections import defaultdict

def BuildWordWordDict(sentences, context_size):
	"""Build probability dictionaries for unigrams and word pairs based on
	co-occurrence within a context window."""
	word_pairs_prob = defaultdict(float)
	unigrams_prob = defaultdict(float)
	num_sentences = len(sentences)
	tokenizer = Tokenizer()  # project-local tokenizer
	for s in sentences:
		words = tokenizer.tokenize(s.lower().strip())
		reverse = list(reversed(words))
		pairs_found = []
		for i in range(len(words)):
			w1 = words[i]
			wr1 = reverse[i]
			for j in range(i + 1, len(words)):
				if j - i > context_size:
					break  # every later j is even farther away, so stop early
				w2 = words[j]
				wr2 = reverse[j]
				# Capture right-context pairs, stored in sorted order.
				pairs_found.append((w1, w2) if w1 < w2 else (w2, w1))
				# Capture left-context pairs from the reversed sentence.
				pairs_found.append((wr1, wr2) if wr1 < wr2 else (wr2, wr1))
		# Count each pair and word at most once per sentence.
		for pair in set(pairs_found):
			word_pairs_prob[pair] += 1
		for unigram in set(words):
			unigrams_prob[unigram] += 1

	# Convert per-sentence counts into probabilities.
	unigrams_prob = {k: v / num_sentences for k, v in unigrams_prob.items()}
	word_pairs_prob = {k: v / num_sentences for k, v in word_pairs_prob.items()}
	return (unigrams_prob, word_pairs_prob)
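A quick way to exercise the function, reusing the whitespace `Tokenizer` stub from the sketch under Example #1 (the real tokenizer is project-specific):

sentences = ["the movie was great", "the plot was dull", "great plot"]
unigrams, pairs = BuildWordWordDict(sentences, context_size=2)
print(unigrams["great"])           # 2/3: "great" occurs in two of three sentences
print(pairs[("great", "movie")])   # 1/3: co-occur within the window in one sentence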
Example #4
import nltk
import scipy.sparse

import sentimentorientation  # project-local module providing getSOAScores

class SentimentOrientation(Feature):
	def name(self):
		# _contextsize is an int, so convert it before concatenating.
		return "SemanticOrientation" + str(self._contextsize)

	def __init__(self, contextsize=100):
		self._contextsize = contextsize
		# Booleans rather than strings (see Example #2).
		self._pwords = get_opinion_words(positive=True)
		self._nwords = get_opinion_words(positive=False)
		self._soscores = {}
		self._tokenizer = Tokenizer()
		self._initialized = False

	def extract_all(self, sentences, labels):
		sentences = self.preprocess_all(sentences)
		# Compute semantic-orientation scores once and reuse them.
		if not self._initialized:
			self._soscores = sentimentorientation.getSOAScores(
				sentences, self._pwords, self._nwords, self._contextsize)
			self._initialized = True
		all_values = []
		neg_values = []  # for debug
		pos_values = []  # ditto
		if not labels:
			# Fabricate dummy labels so the debug min/max below still work.
			labels = [0] * len(sentences)
			labels[0] = 1
		for sentence, label in zip(sentences, labels):
			score = 0.0
			words = self._tokenizer.tokenize(sentence)
			tagged = nltk.pos_tag(words)
			for word, tag in tagged:
				# Only adjectives, adverbs, nouns, and verbs contribute.
				if not (tag.startswith("JJ") or tag.startswith("RB")
						or tag.startswith("NN") or tag.startswith("VB")):
					continue
				if word in self._soscores:
					score += self._soscores[word]
			all_values.append([score])
			if label == 0:
				neg_values.append(score)
			else:
				pos_values.append(score)
		print(scipy.sparse.coo_matrix(all_values).shape)
		print(min(all_values), ":", max(all_values))
		print(min(pos_values), "-pos+", max(pos_values))
		print(min(neg_values), "-neg+", max(neg_values))
		return scipy.sparse.coo_matrix(all_values)
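A hedged usage sketch for the feature as a whole. `Feature`, `get_opinion_words`, `preprocess_all`, and `sentimentorientation.getSOAScores` come from the surrounding project and are not shown here, so this illustrates the calling convention only, not a self-contained pipeline:

feature = SentimentOrientation(contextsize=50)
sentences = ["the acting was brilliant", "the plot was a mess"]
labels = [1, 0]
X = feature.extract_all(sentences, labels)  # sparse column of per-sentence SO scores
print(X.toarray())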