def tokenize(self, sentence):
    """Tokenize a sentence, memoizing results in the shared class-level cache."""
    sentence = self.preprocess(sentence)
    if sentence not in Feature.tokens_cache:
        tokenizer = Tokenizer()
        # Spell-correction pass, currently disabled:
        # Feature.tokens_cache[sentence] = [Feature.checker.correct(t) for t in tokenizer.tokenize(sentence)]
        Feature.tokens_cache[sentence] = tokenizer.tokenize(sentence)
    return Feature.tokens_cache[sentence]
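# Usage sketch (illustrative only): repeated calls with the same sentence are
# served from the shared class-level cache, so tokenization runs at most once
# per distinct preprocessed sentence. `BagOfWords` is a hypothetical Feature
# subclass standing in for any concrete feature class in this project.
#
#   feature = BagOfWords()
#   first = feature.tokenize("the battery life is great")
#   second = feature.tokenize("the battery life is great")
#   assert first is second  # same cached list object, no re-tokenization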
from collections import defaultdict

def BuildWordWordDict(sentences, context_size):
    """Build probability dictionaries for unigrams and word pairs based on
    co-occurrence within a context window."""
    word_pairs_prob = defaultdict(float)
    unigrams_prob = defaultdict(float)
    numReviews = len(sentences)
    tokenizer = Tokenizer()
    for s in sentences:
        words = tokenizer.tokenize(s.lower().strip())
        reverse = list(words)
        reverse.reverse()
        pairs_found = []
        for i in range(len(words)):
            w1 = words[i]
            wr1 = reverse[i]
            for j in range(i + 1, len(words)):
                w2 = words[j]
                wr2 = reverse[j]
                if abs(i - j) > context_size:
                    continue
                # Capture right-context pairs; each pair is stored in sorted
                # order so (a, b) and (b, a) count as the same co-occurrence.
                if w1 < w2:
                    pairs_found.append((w1, w2))
                else:
                    pairs_found.append((w2, w1))
                # Capture left-context pairs from the reversed sentence.
                if wr1 < wr2:
                    pairs_found.append((wr1, wr2))
                else:
                    pairs_found.append((wr2, wr1))
        # Count each pair/unigram at most once per review (document frequency).
        for pair in set(pairs_found):
            word_pairs_prob[pair] += 1
        for unigram in set(words):
            unigrams_prob[unigram] += 1
    # Normalize counts to per-review probabilities.
    unigrams_prob = {k: v / numReviews for k, v in unigrams_prob.items()}
    word_pairs_prob = {k: v / numReviews for k, v in word_pairs_prob.items()}
    return (unigrams_prob, word_pairs_prob)
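# Illustrative sketch, not the project's actual getSOAScores implementation:
# one common way to turn these co-occurrence probabilities into a semantic
# orientation score is Turney-style SO-PMI, summing PMI with positive seed
# words and subtracting PMI with negative seed words. `so_pmi_sketch` and
# `eps` are hypothetical names introduced here for illustration.
import math

def so_pmi_sketch(word, pwords, nwords, unigrams_prob, word_pairs_prob, eps=1e-6):
    """Hypothetical helper: SO(word) = sum_p PMI(word, p) - sum_n PMI(word, n)."""
    def pmi(w1, w2):
        pair = (w1, w2) if w1 < w2 else (w2, w1)  # pairs are stored sorted
        p_pair = word_pairs_prob.get(pair, 0.0)
        p1 = unigrams_prob.get(w1, 0.0)
        p2 = unigrams_prob.get(w2, 0.0)
        if p_pair == 0.0 or p1 == 0.0 or p2 == 0.0:
            return 0.0  # never co-occurred or never seen: no evidence
        return math.log((p_pair + eps) / (p1 * p2 + eps))
    return (sum(pmi(word, p) for p in pwords)
            - sum(pmi(word, n) for n in nwords))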
import nltk
import scipy.sparse

class SentimentOrientation(Feature):
    def name(self):
        # str() needed: concatenating str + int raises TypeError.
        return "SemanticOrientation" + str(self._contextsize)

    def __init__(self, contextsize=100):
        self._contextsize = contextsize
        self._pwords = get_opinion_words(positive="True")
        self._nwords = get_opinion_words(positive="False")
        self._soscores = {}
        self._tokenizer = Tokenizer()
        self._initialized = False

    def extract_all(self, sentences, labels):
        sentences = self.preprocess_all(sentences)
        # Compute semantic-orientation scores once and cache them.
        if not self._initialized:
            self._soscores = sentimentorientation.getSOAScores(
                sentences, self._pwords, self._nwords, self._contextsize)
            self._initialized = True
        all_values = []
        neg_values = []  # for debugging
        pos_values = []  # ditto
        if not labels:
            # Dummy labels so the debug min/max below see both classes.
            labels = [0] * len(sentences)
            labels[0] = 1
        samples = zip(sentences, labels)
        for sentence, label in samples:
            score = 0.0
            words = self._tokenizer.tokenize(sentence)
            tagged = nltk.pos_tag(words)
            for word, tag in tagged:
                # Only adjectives, adverbs, nouns, and verbs contribute.
                if not (tag.startswith("JJ") or tag.startswith("RB")
                        or tag.startswith("NN") or tag.startswith("VB")):
                    continue
                if word in self._soscores:
                    score += self._soscores[word]
            all_values.append([score])
            if label == 0:
                neg_values.append(score)
            else:
                pos_values.append(score)
        print(scipy.sparse.coo_matrix(all_values).shape)
        print(min(all_values), ":", max(all_values))
        print(min(pos_values), "-pos+", max(pos_values))
        print(min(neg_values), "-neg+", max(neg_values))
        return scipy.sparse.coo_matrix(all_values)
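# Usage sketch (illustrative only): extract a one-column semantic-orientation
# feature matrix for a tiny labeled corpus. The sentences and labels are toy
# data; Tokenizer, get_opinion_words, and sentimentorientation are assumed to
# be importable from this project.
#
#   sentences = ["the screen is bright and beautiful",
#                "the battery died after an hour"]
#   labels = [1, 0]  # 1 = positive, 0 = negative
#   so_feature = SentimentOrientation(contextsize=100)
#   X = so_feature.extract_all(sentences, labels)  # scipy.sparse.coo_matrix of shape (2, 1)
#   print(X.toarray())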