def _add_example(self, cls, message): """Add a training example """ words = preprocessing.extract_words(message) if not words: # Treat cases where 'paper cut' is pre-processing as negatives. return # Update self.documents[n] and self.vocab[n] for ngrams and cls for n in (1,2,3): ngrams = preprocessing.get_ngrams(n, words) self.documents[n].append((cls,ngrams))
def _add_example(self, cls, message): """Add a training example """ words = preprocessing.extract_words(message) if not words: # Treat cases where 'paper cut' is pre-processing as negatives. return # Update self.documents[n] and self.vocab[n] for ngrams and cls for n in (1, 2, 3): ngrams = preprocessing.get_ngrams(n, words) self.documents[n].append((cls, ngrams))
def _add_example(self, cls, message): """Add a training example """ words = preprocessing.extract_words(message) if not words: return documents = self.pos_documents if cls else self.neg_documents for n in (1, 2, 3): ngrams = preprocessing.get_ngrams(n, words) documents[n].append(ngrams) for g in ngrams: self.vocab[n].add(g)
def _add_example(self, cls, message): """Add a training example """ words = preprocessing.extract_words(message) if not words: # Treat cases where 'paper cut' is pre-processing as negatives. return self.class_count[cls] += 1 # Update ngram_counts and ngram_keys for ngrams cls for n in (1,2,3): ngrams = preprocessing.get_ngrams(n, words) ngrams = BayesClassifier.get_features(ngrams) for g in ngrams: count = self.ngram_counts[n].get(g, [0,0]) count[cls] += 1 self.ngram_counts[n][g] = count self.ngram_keys[n].add(g)
def _add_example(self, cls, message): """Add a training example """ words = preprocessing.extract_words(message) if not words: # Treat cases where 'paper cut' is pre-processing as negatives. return self.class_count[cls] += 1 # Update ngram_counts and ngram_keys for ngrams cls for n in (1, 2, 3): ngrams = preprocessing.get_ngrams(n, words) ngrams = BayesClassifier.get_features(ngrams) for g in ngrams: count = self.ngram_counts[n].get(g, [0, 0]) count[cls] += 1 self.ngram_counts[n][g] = count self.ngram_keys[n].add(g)
def classify(self, message, detailed=False): """ 'message' is a string to classify. Return True or False classification. Method is to calculate a log_odds from a liklihood based on trigram, bigram and unigram (p,n) counts in the training set For each trigram return smoothed trigram score if trigram in training set, else for the 2 bigrams in the trigram return smoothed bigram score if bigram in training set, else for the 2 unigrams in the bigram return smoothed unigram score get_score() shows the smoothed scoring <n>gram_score() shows the backoff and smoothing factors """ words = preprocessing.extract_words(message) if not words: return False, 0.0 # Best intuition would be to compute back-off based on counts ngrams = dict( (n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3)) query_vecs = dict( (n, RocchioClassifier.get_query_vec(ngrams[n])) for n in (1, 2, 3)) weights = RocchioClassifier.get_weights() def get_weighted_distance(centroid): return sum( RocchioClassifier.get_distance(centroid[n], query_vecs[n]) * weights[n] for n in (1, 2, 3)) pos_distance = get_weighted_distance(self.pos_centroid) neg_distance = get_weighted_distance(self.neg_centroid) diff = (pos_distance + EPSILON) / (neg_distance + EPSILON) return diff > RocchioClassifier.threshold, math.log(diff)
def classify(self, message, detailed=False): """ 'message' is a string to classify. Return True or False classification. Method is to calculate a log_odds from a liklihood based on trigram, bigram and unigram (p,n) counts in the training set For each trigram return smoothed trigram score if trigram in training set, else for the 2 bigrams in the trigram return smoothed bigram score if bigram in training set, else for the 2 unigrams in the bigram return smoothed unigram score get_score() shows the smoothed scoring <n>gram_score() shows the backoff and smoothing factors """ words = preprocessing.extract_words(message) if not words: return False, 0.0 # Best intuition would be to compute back-off based on counts ngrams = dict((n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3)) query_vecs = dict((n, RocchioClassifier.get_query_vec(ngrams[n])) for n in (1, 2, 3)) weights = RocchioClassifier.get_weights() def get_weighted_distance(centroid): return sum(RocchioClassifier.get_distance(centroid[n], query_vecs[n]) * weights[n] for n in (1, 2, 3)) pos_distance = get_weighted_distance(self.pos_centroid) neg_distance = get_weighted_distance(self.neg_centroid) diff = (pos_distance + EPSILON) / (neg_distance + EPSILON) return diff > RocchioClassifier.threshold, math.log(diff)
def classify(self, message, detailed=False): """ 'message' is a string to classify. Return True or False classification. Method is to calculate a log_odds from a liklihood based on trigram, bigram and unigram (p,n) counts in the training set For each trigram return smoothed trigram score if trigram in training set, else for the 2 bigrams in the trigram return smoothed bigram score if bigram in training set, else for the 2 unigrams in the bigram return smoothed unigram score get_score() shows the smoothed scoring <n>gram_score() shows the backoff and smoothing factors """ def get_docs_with_terms(vectorizer, ngrams): docs = set() for term in ngrams: if term in vectorizer.vocab: docs |= set(vectorizer.tfidf[term].keys()) return docs def get_nearest(K, documents, vectorizer, ngrams, doc_ids): """Return doc ids of K documents nearest query_vec """ # Compute scores and add to a priority queue scores = [] for i in doc_ids: heapq.heappush(scores, (vectorizer.get_distance(i, ngrams), i, documents[i][0])) # Return top K scores return [(cls,i,dist) for dist,i,cls in heapq.nlargest(K,scores)] words = preprocessing.extract_words(message) if not words: return False, 0.0 ngrams = dict((n,preprocessing.get_ngrams(n, words)) for n in (1,2,3)) diffs = {} for n in (1,2,3): doc_ids = get_docs_with_terms(self.vectorizers[n], ngrams[n]) nearest = get_nearest(KnnClassifier.K, self.documents[n], self.vectorizers[n], ngrams[n], doc_ids ) pos = sum((1 if cls else -1) * (KnnClassifier.backoff ** k) for k,(cls,_,_) in enumerate(nearest)) max_pos = sum(KnnClassifier.backoff ** k for k in range(len(nearest))) # pos2/max_pos2 is in range [-1,+1] pos2 = sum((1 if cls else -1) * (KnnClassifier.backoff2 ** (2*k)) for k,(cls,_,_) in enumerate(nearest)) max_pos2 = sum(KnnClassifier.backoff2 ** (2*k) for k in range(len(nearest))) pos *= pos2 max_pos *= max_pos2 diffs[n] = pos/max_pos if max_pos else 0.0 weights = KnnClassifier.get_weights() diff = sum(diffs[n]*weights[n] for n in (1,2,3)) return diff > KnnClassifier.threshold, diff
def classify(self, message, detailed=False): """ 'message' is a string to classify. Return True or False classification. Method is to calculate a log_odds from a liklihood based on trigram, bigram and unigram (p,n) counts in the training set For each trigram return smoothed trigram score if trigram in training set, else for the 2 bigrams in the trigram return smoothed bigram score if bigram in training set, else for the 2 unigrams in the bigram return smoothed unigram score get_score() shows the smoothed scoring <n>gram_score() shows the backoff and smoothing factors """ def get_docs_with_terms(vectorizer, ngrams): docs = set() for term in ngrams: if term in vectorizer.vocab: docs |= set(vectorizer.tfidf[term].keys()) return docs def get_nearest(K, documents, vectorizer, ngrams, doc_ids): """Return doc ids of K documents nearest query_vec """ # Compute scores and add to a priority queue scores = [] for i in doc_ids: heapq.heappush( scores, (vectorizer.get_distance(i, ngrams), i, documents[i][0])) # Return top K scores return [(cls, i, dist) for dist, i, cls in heapq.nlargest(K, scores)] words = preprocessing.extract_words(message) if not words: return False, 0.0 ngrams = dict( (n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3)) diffs = {} for n in (1, 2, 3): doc_ids = get_docs_with_terms(self.vectorizers[n], ngrams[n]) nearest = get_nearest(KnnClassifier.K, self.documents[n], self.vectorizers[n], ngrams[n], doc_ids) pos = sum((1 if cls else -1) * (KnnClassifier.backoff**k) for k, (cls, _, _) in enumerate(nearest)) max_pos = sum(KnnClassifier.backoff**k for k in range(len(nearest))) # pos2/max_pos2 is in range [-1,+1] pos2 = sum((1 if cls else -1) * (KnnClassifier.backoff2**(2 * k)) for k, (cls, _, _) in enumerate(nearest)) max_pos2 = sum(KnnClassifier.backoff2**(2 * k) for k in range(len(nearest))) pos *= pos2 max_pos *= max_pos2 diffs[n] = pos / max_pos if max_pos else 0.0 weights = KnnClassifier.get_weights() diff = sum(diffs[n] * weights[n] for n in (1, 2, 3)) return diff > KnnClassifier.threshold, diff