def test_extract_words():
    text = "This movie, while not great, was decent"
    words = preprocessing.extract_words(text)
    assert_equals(['this', 'movie', 'while', 'not', 'not_great', 'was', 'decent'], words)
    words = preprocessing.extract_words(text, use_negation=False)
    assert_equals(['this', 'movie', 'while', 'not', 'great', 'was', 'decent'], words)
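# For reference, a minimal sketch of an extract_words that would satisfy the
# test above. This is an assumption about preprocessing.extract_words, not the
# project's actual implementation: the tokenization and the rule that negation
# scope ends at punctuation are guesses made to fit the expected output.
import re

NEGATION_WORDS = {'not', 'no', 'never'}  # assumed set of negation triggers

def extract_words(text, use_negation=True):
    """Lowercase and tokenize 'text'; with use_negation, prefix words that
       follow a negation word with 'not_' until punctuation ends the scope."""
    words = []
    negating = False
    for token in text.lower().split():
        word = re.sub(r'[^a-z0-9_]', '', token)
        if not word:
            continue
        if use_negation and negating and word not in NEGATION_WORDS:
            words.append('not_' + word)
        else:
            words.append(word)
        if word in NEGATION_WORDS:
            negating = True
        elif token[-1] in ',.;:!?':
            negating = False
    return words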
def _add_example(self, cls, message):
    """Add a training example"""
    words = preprocessing.extract_words(message)
    if not words:
        # Skip examples that pre-processing reduces to nothing.
        return
    # Record the example's ngrams and class in self.documents[n] for n = 1, 2, 3
    for n in (1, 2, 3):
        ngrams = preprocessing.get_ngrams(n, words)
        self.documents[n].append((cls, ngrams))
def _add_example(self, cls, message):
    """Add a training example"""
    words = preprocessing.extract_words(message)
    if not words:
        return
    documents = self.pos_documents if cls else self.neg_documents
    for n in (1, 2, 3):
        ngrams = preprocessing.get_ngrams(n, words)
        documents[n].append(ngrams)
        for g in ngrams:
            self.vocab[n].add(g)
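# get_ngrams is used by every _add_example variant but not shown here. A
# plausible sketch, assuming n-grams are delimiter-joined word strings (which
# matches how the Bayes classify() below rebuilds trigrams and bigrams with
# WORD_DELIMITER.join); the delimiter value itself is an assumption.
WORD_DELIMITER = '_'  # assumed value; the real one comes from preprocessing

def get_ngrams(n, words):
    """Return all n-grams of 'words', each as a single joined string."""
    return [WORD_DELIMITER.join(words[i:i + n]) for i in range(len(words) - n + 1)]

# e.g. get_ngrams(2, ['not', 'great', 'movie']) -> ['not_great', 'great_movie']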
def _add_example(self, cls, message):
    """Add a training example"""
    words = preprocessing.extract_words(message)
    if not words:
        # Skip examples that pre-processing reduces to nothing.
        return
    self.class_count[cls] += 1
    # Update ngram_counts and ngram_keys with the example's ngrams and class
    for n in (1, 2, 3):
        ngrams = preprocessing.get_ngrams(n, words)
        ngrams = BayesClassifier.get_features(ngrams)
        for g in ngrams:
            count = self.ngram_counts[n].get(g, [0, 0])
            count[cls] += 1
            self.ngram_counts[n][g] = count
            self.ngram_keys[n].add(g)
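# The Bayes classify() below reads self.cntv_ngrams[n] = (cntn, cntp, v). A
# hypothetical helper showing how those totals could be derived from the
# ngram_counts built above; this method is assumed, not part of the original class.
def _update_cntv(self):
    """Recompute (cntn, cntp, v) for each n from self.ngram_counts:
       total negative occurrences, total positive occurrences, and the
       number of unique ngrams seen in training."""
    self.cntv_ngrams = {}
    for n in (1, 2, 3):
        counts = self.ngram_counts[n].values()
        cntn = sum(neg for neg, pos in counts)
        cntp = sum(pos for neg, pos in counts)
        self.cntv_ngrams[n] = (cntn, cntp, len(self.ngram_counts[n]))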
def decide(text):
    dictionary = {}
    preprocessing.handle_file_text("demo", text, 3, dictionary)
    value_list = []
    row_list = []
    col_list = []
    row = 0
    for key in dictionary.keys():
        dictionary[key] = Counter(dictionary[key])
    # word_list and pipeline are module-level globals: the training vocabulary
    # (one column per word) and a fitted classifier exposing predict().
    lookup = {word: index for index, word in enumerate(word_list)}
    words = set(preprocessing.extract_words(text))
    for word in words:
        if word in lookup:
            value_list.append(dictionary['demo'][word])
            row_list.append(row)
            col_list.append(lookup[word])
    row += 1
    # Single-row sparse count vector over the training vocabulary
    input_matrix = csr_matrix((value_list, (row_list, col_list)),
                              shape=(1, len(word_list)), dtype=np.int8)
    return pipeline.predict(input_matrix)
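# A runnable toy sketch of how the word_list and pipeline globals used by
# decide() might be built with scikit-learn. The estimator choice and the toy
# data are assumptions made for illustration, not the project's training code.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

word_list = ['decent', 'great', 'movie', 'terrible']
train_matrix = csr_matrix(np.array([[1, 1, 1, 0],    # one row of word counts
                                    [0, 0, 1, 1]],   # per training document
                                   dtype=np.int8))
train_labels = [1, 0]  # 1 = positive, 0 = negative

pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression())])
pipeline.fit(train_matrix, train_labels)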
def classify(self, message, detailed=False):
    """'message' is a string to classify. Return a (True/False classification,
        log score) pair.

        Method: build 1-, 2- and 3-gram query vectors for the message, compute
        the weighted distance of the query to the positive and negative
        training centroids, and classify on the log of the ratio of the two
        distances.
    """
    words = preprocessing.extract_words(message)
    if not words:
        return False, 0.0

    # Best intuition would be to compute back-off based on counts
    ngrams = dict((n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3))
    query_vecs = dict((n, RocchioClassifier.get_query_vec(ngrams[n])) for n in (1, 2, 3))
    weights = RocchioClassifier.get_weights()

    def get_weighted_distance(centroid):
        return sum(RocchioClassifier.get_distance(centroid[n], query_vecs[n]) * weights[n]
                   for n in (1, 2, 3))

    pos_distance = get_weighted_distance(self.pos_centroid)
    neg_distance = get_weighted_distance(self.neg_centroid)

    diff = (pos_distance + EPSILON) / (neg_distance + EPSILON)
    return diff > RocchioClassifier.threshold, math.log(diff)
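# RocchioClassifier.get_query_vec and get_distance are not shown. A plausible
# sketch, assuming dict-of-term-weight vectors and cosine similarity as the
# "distance" (larger = closer, which makes the pos/neg ratio above behave like
# an odds). Both helpers here are assumptions, not the project's code.
import math
from collections import Counter

def get_query_vec(ngrams):
    """Term-frequency vector of the query's ngrams, as a {term: count} dict."""
    return dict(Counter(ngrams))

def get_distance(centroid, query_vec):
    """Cosine similarity between a centroid vector and a query vector."""
    dot = sum(w * centroid.get(term, 0.0) for term, w in query_vec.items())
    norm_c = math.sqrt(sum(w * w for w in centroid.values()))
    norm_q = math.sqrt(sum(w * w for w in query_vec.values()))
    return dot / (norm_c * norm_q) if norm_c and norm_q else 0.0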
def preprocessed(documents):
    for document in documents:
        document = clean(document)
        document = extract_words(document)
        yield document
def classify(self, message, detailed=False):
    """'message' is a string to classify. Return a (True/False classification,
        score) pair.

        Method: for n = 1, 2, 3, find the K training documents nearest the
        message's n-grams, take a vote over their classes with weights that
        decay geometrically by rank, then combine the per-n votes with
        get_weights().
    """
    def get_docs_with_terms(vectorizer, ngrams):
        docs = set()
        for term in ngrams:
            if term in vectorizer.vocab:
                docs |= set(vectorizer.tfidf[term].keys())
        return docs

    def get_nearest(K, documents, vectorizer, ngrams, doc_ids):
        """Return (class, doc id, distance) for the K documents nearest the query"""
        # Compute scores and add to a priority queue
        scores = []
        for i in doc_ids:
            heapq.heappush(scores, (vectorizer.get_distance(i, ngrams), i, documents[i][0]))
        # Return top K scores
        return [(cls, i, dist) for dist, i, cls in heapq.nlargest(K, scores)]

    words = preprocessing.extract_words(message)
    if not words:
        return False, 0.0

    ngrams = dict((n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3))

    diffs = {}
    for n in (1, 2, 3):
        doc_ids = get_docs_with_terms(self.vectorizers[n], ngrams[n])
        nearest = get_nearest(KnnClassifier.K, self.documents[n],
                              self.vectorizers[n], ngrams[n], doc_ids)
        pos = sum((1 if cls else -1) * (KnnClassifier.backoff ** k)
                  for k, (cls, _, _) in enumerate(nearest))
        max_pos = sum(KnnClassifier.backoff ** k for k in range(len(nearest)))
        # pos2/max_pos2 is in range [-1,+1]
        pos2 = sum((1 if cls else -1) * (KnnClassifier.backoff2 ** (2 * k))
                   for k, (cls, _, _) in enumerate(nearest))
        max_pos2 = sum(KnnClassifier.backoff2 ** (2 * k) for k in range(len(nearest)))
        pos *= pos2
        max_pos *= max_pos2
        diffs[n] = pos / max_pos if max_pos else 0.0

    weights = KnnClassifier.get_weights()
    diff = sum(diffs[n] * weights[n] for n in (1, 2, 3))
    return diff > KnnClassifier.threshold, diff
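# The rank-decay vote inside the loop above is easier to see in isolation.
# A worked example with an assumed backoff value (the real one is a class
# attribute of KnnClassifier):
backoff = 0.9
nearest = [(True, 7, 0.91), (False, 3, 0.88), (True, 5, 0.85)]  # (cls, doc id, dist)

pos = sum((1 if cls else -1) * backoff ** k for k, (cls, _, _) in enumerate(nearest))
max_pos = sum(backoff ** k for k in range(len(nearest)))
print(pos / max_pos)  # (1 - 0.9 + 0.81) / (1 + 0.9 + 0.81) ~ 0.34: a weak positive vote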
def classify(self, message, detailed=False):
    """'message' is a string to classify. Return True or False classification.

        Method is to calculate a log odds from a likelihood based on trigram,
        bigram and unigram (pos, neg) counts in the training set.
        For each trigram
            return the smoothed trigram score if the trigram is in the training set, else
            for the 2 bigrams in the trigram
                return the smoothed bigram score if the bigram is in the training set, else
                for the 2 unigrams in the bigram
                    return the smoothed unigram score

        get_score() shows the smoothed scoring
        ngram_score() shows the backoff and smoothing factors
    """
    words = preprocessing.extract_words(message)
    if detailed:
        print(words)
    if not words:
        return False, 0.0

    # Using dicts with offset keys prevents the same ngram being included twice
    ngrams = dict((n, {}) for n in (1, 2, 3))

    from preprocessing import WORD_DELIMITER

    # Best intuition is to compute back-off based on counts.
    # i runs over every trigram start position.
    for i in range(len(words) - 2):
        tri = WORD_DELIMITER.join(words[i:i + 3])
        if tri in self.ngram_keys[3]:
            ngrams[3][i] = tri
        else:
            for j in (0, 1):
                bi = WORD_DELIMITER.join(words[i + j:i + j + 2])
                if bi in self.ngram_keys[2]:
                    ngrams[2][i + j] = bi
                else:
                    for k in (0, 1):
                        ngrams[1][i + j + k] = words[i + j + k]

    for n in (1, 2, 3):
        ngrams[n] = BayesClassifier.get_features(ngrams[n].values())

    def get_score(counts, cntv, alpha):
        """Get a smoothed score for an ngram

            counts = (neg, pos)
                neg: number of negatives for ngram in training set
                pos: number of positives for ngram in training set
            cntv = (cntn, cntp, v) for an ngram training dict
                cntn: total number of negatives
                cntp: total number of positives
                v: number of unique ngrams
            alpha: smoothing factor

            Returns: a smoothed score for the ngram
        """
        neg, pos = counts
        if neg == pos:
            return 0
        cntn, cntp, v = cntv
        return (math.log((pos + alpha) / (cntp + v * alpha))
                - math.log((neg + alpha) / (cntn + v * alpha)))

    if detailed:
        def _dbg(n, score, g):
            print('%d%s [%.2f] %s' % (n, ' ' * (3 - n), score, g))
    else:
        def _dbg(n, score, g):
            pass

    weights = BayesClassifier.get_weights()
    smoothings = BayesClassifier.get_smoothings()

    def ngram_score(n, g):
        score = get_score(self.ngram_counts[n].get(g, [0, 0]),
                          self.cntv_ngrams[n], smoothings[n])
        _dbg(n, score, g)
        return score

    neg, pos = self.class_count
    prior = math.log(pos) - math.log(neg)
    likelihood = sum(weights[n] * sum(ngram_score(n, g) for g in ngrams[n])
                     for n in (1, 2, 3))
    log_odds = prior + likelihood

    if detailed:
        n_gram_dict = {}
        for n in (1, 2, 3):
            for g in ngrams[n]:
                n_gram_dict[g] = ngram_score(n, g) * weights[n]
        print('ngrams scores --------------')
        for k in sorted(n_gram_dict, key=lambda x: n_gram_dict[x]):
            print('%6.3f : %s' % (n_gram_dict[k], k))

    return log_odds > BayesClassifier.threshold, log_odds
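# A worked instance of get_score()'s smoothing: the counts, totals and alpha
# below are all made up for illustration.
import math

alpha = 0.5                        # assumed smoothing factor
neg, pos = 3, 10                   # ngram seen 3 times in neg, 10 in pos examples
cntn, cntp, v = 1000, 1200, 5000   # made-up training totals and unique-ngram count

score = (math.log((pos + alpha) / (cntp + v * alpha))
         - math.log((neg + alpha) / (cntn + v * alpha)))
print(score)  # ~1.04: log of the smoothed positive/negative likelihood ratio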