Example #1
    def _add_example(self, cls, message):
        """Add a training example
        """

        words = preprocessing.extract_words(message)
        if not words:  # Treat messages that pre-processing reduces to nothing (e.g. just 'paper cut') as negatives.
            return

        # Update self.documents[n] and self.vocab[n] for the example's ngrams and cls
        for n in (1, 2, 3):
            ngrams = preprocessing.get_ngrams(n, words)
            self.documents[n].append((cls, ngrams))
            for g in ngrams:
                self.vocab[n].add(g)
Example #2
    def _add_example(self, cls, message):
        """Add a training example
        """

        words = preprocessing.extract_words(message)
        if not words:  # Treat messages that pre-processing reduces to nothing (e.g. just 'paper cut') as negatives.
            return

        # Update self.documents[n] and self.vocab[n] for the example's ngrams and cls
        for n in (1, 2, 3):
            ngrams = preprocessing.get_ngrams(n, words)
            self.documents[n].append((cls, ngrams))
            for g in ngrams:
                self.vocab[n].add(g)
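All of the _add_example and classify snippets in this listing call preprocessing.extract_words() and preprocessing.get_ngrams(), which are not shown here. A minimal sketch of what they are assumed to do (the real extract_words presumably also normalises and filters tokens):

import re

def extract_words(message):
    # Assumed behaviour: lower-case the message and return its word tokens.
    return re.findall(r"[a-z']+", message.lower())

def get_ngrams(n, words):
    # Return the n-grams of `words` as hashable tuples, e.g. n=2 -> [('paper', 'cut'), ...].
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]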
Example #3
    def _add_example(self, cls, message):
        """Add a training example
        """

        words = preprocessing.extract_words(message)
        if not words:
            return

        documents = self.pos_documents if cls else self.neg_documents

        for n in (1, 2, 3):
            ngrams = preprocessing.get_ngrams(n, words)
            documents[n].append(ngrams)
            for g in ngrams:
                self.vocab[n].add(g)
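Example #3 keeps positive and negative training documents apart. The containers it fills are not shown in the listing; given how they are used here, their initialisation in __init__ was presumably something like this sketch:

    def __init__(self):
        # One bucket per n-gram order (1, 2, 3): the document lists hold one list of
        # n-grams per training message, vocab holds the distinct n-grams seen so far.
        self.pos_documents = {n: [] for n in (1, 2, 3)}
        self.neg_documents = {n: [] for n in (1, 2, 3)}
        self.vocab = {n: set() for n in (1, 2, 3)}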
Example #4
    def _add_example(self, cls, message):
        """Add a training example
        """

        words = preprocessing.extract_words(message)
        if not words:  # Treat messages that pre-processing reduces to nothing (e.g. just 'paper cut') as negatives.
            return

        self.class_count[cls] += 1

        # Update ngram_counts and ngram_keys for the example's ngrams and cls
        for n in (1, 2, 3):
            ngrams = preprocessing.get_ngrams(n, words)
            ngrams = BayesClassifier.get_features(ngrams)
            for g in ngrams:
                count = self.ngram_counts[n].get(g, [0, 0])
                count[cls] += 1
                self.ngram_counts[n][g] = count
                self.ngram_keys[n].add(g)
Example #5
    def _add_example(self, cls, message):
        """Add a training example
        """

        words = preprocessing.extract_words(message)
        if not words:  # Treat messages that pre-processing reduces to nothing (e.g. just 'paper cut') as negatives.
            return

        self.class_count[cls] += 1

        # Update ngram_counts and ngram_keys for the example's ngrams and cls
        for n in (1, 2, 3):
            ngrams = preprocessing.get_ngrams(n, words)
            ngrams = BayesClassifier.get_features(ngrams)
            for g in ngrams:
                count = self.ngram_counts[n].get(g, [0, 0])
                count[cls] += 1
                self.ngram_counts[n][g] = count
                self.ngram_keys[n].add(g)
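The per-class counts collected in Examples #4 and #5 are the raw material for scoring at classification time. How the repository smooths and combines them is not shown in the listing; purely as an illustration, a Laplace-smoothed log-odds for a single n-gram could be computed like this (the smoothing constant and the formula are assumptions; only ngram_counts and class_count come from the snippet above):

import math

def log_odds(clf, n, g, alpha=1.0):
    # ngram_counts[n][g] is stored as [negative count, positive count], since cls indexes the list.
    neg, pos = clf.ngram_counts[n].get(g, [0, 0])
    n_pos, n_neg = clf.class_count[True], clf.class_count[False]
    p_pos = (pos + alpha) / (n_pos + 2 * alpha)   # smoothed P(g | positive)
    p_neg = (neg + alpha) / (n_neg + 2 * alpha)   # smoothed P(g | negative)
    return math.log(p_pos / p_neg)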
Example #6
    def classify(self, message, detailed=False):
        """ 
            'message' is a string to classify. Return True or False classification.
            
            Method is to calculate a log_odds from a liklihood based on
            trigram, bigram and unigram (p,n) counts in the training set
            For each trigram
                return smoothed trigram score if trigram in training set, else
                for the 2 bigrams in the trigram
                    return smoothed bigram score if bigram in training set, else
                    for the 2 unigrams in the bigram
                        return smoothed unigram score
                        
            get_score() shows the smoothed scoring    
            <n>gram_score() shows the backoff and smoothing factors    
        """

        words = preprocessing.extract_words(message)
        if not words:
            return False, 0.0

        # Intuitively, the back-off weights should be computed from the n-gram counts
        ngrams = dict(
            (n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3))

        query_vecs = dict(
            (n, RocchioClassifier.get_query_vec(ngrams[n])) for n in (1, 2, 3))

        weights = RocchioClassifier.get_weights()

        def get_weighted_distance(centroid):
            return sum(
                RocchioClassifier.get_distance(centroid[n], query_vecs[n]) *
                weights[n] for n in (1, 2, 3))

        pos_distance = get_weighted_distance(self.pos_centroid)
        neg_distance = get_weighted_distance(self.neg_centroid)

        diff = (pos_distance + EPSILON) / (neg_distance + EPSILON)
        return diff > RocchioClassifier.threshold, math.log(diff)
Example #7
    def classify(self, message, detailed=False):
        """ 
            'message' is a string to classify. Return True or False classification.
            
            Method is to calculate a log_odds from a liklihood based on
            trigram, bigram and unigram (p,n) counts in the training set
            For each trigram
                return smoothed trigram score if trigram in training set, else
                for the 2 bigrams in the trigram
                    return smoothed bigram score if bigram in training set, else
                    for the 2 unigrams in the bigram
                        return smoothed unigram score
                        
            get_score() shows the smoothed scoring    
            <n>gram_score() shows the backoff and smoothing factors    
        """

        words = preprocessing.extract_words(message)
        if not words:
            return False, 0.0

        # Intuitively, the back-off weights should be computed from the n-gram counts
        ngrams = dict((n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3))

        query_vecs = dict((n, RocchioClassifier.get_query_vec(ngrams[n])) for n in (1, 2, 3))

        weights = RocchioClassifier.get_weights()

        def get_weighted_distance(centroid):
            return sum(RocchioClassifier.get_distance(centroid[n], query_vecs[n]) * weights[n] for n in (1, 2, 3))

        pos_distance = get_weighted_distance(self.pos_centroid)
        neg_distance = get_weighted_distance(self.neg_centroid)

        diff = (pos_distance + EPSILON) / (neg_distance + EPSILON)
        return diff > RocchioClassifier.threshold, math.log(diff)
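Examples #6 and #7 depend on three RocchioClassifier helpers that the listing does not include: get_query_vec(), get_distance() and get_weights(). Given how classify() uses their results (a larger value for the positive centroid pushes the ratio above the threshold), they are assumed to behave roughly like this sketch, with cosine similarity standing in for 'distance'; none of it is the repository's actual implementation:

import math
from collections import Counter

class RocchioHelpersSketch:
    """Illustrative stand-ins for the RocchioClassifier helpers used above."""

    @staticmethod
    def get_query_vec(ngrams):
        # Assumed: a raw term-frequency vector of the query n-grams.
        return Counter(ngrams)

    @staticmethod
    def get_distance(centroid, query_vec):
        # Assumed: cosine similarity, so a larger value means the query is closer to the centroid.
        dot = sum(w * centroid.get(term, 0.0) for term, w in query_vec.items())
        norm = (math.sqrt(sum(v * v for v in centroid.values())) *
                math.sqrt(sum(v * v for v in query_vec.values())))
        return dot / norm if norm else 0.0

    @staticmethod
    def get_weights():
        # Assumed: relative weight given to unigram, bigram and trigram evidence.
        return {1: 1.0, 2: 2.0, 3: 4.0}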
Example #8
    def classify(self, message, detailed=False):
        """ 
            'message' is a string to classify. Return True or False classification.
            
            Method is to calculate a log_odds from a liklihood based on
            trigram, bigram and unigram (p,n) counts in the training set
            For each trigram
                return smoothed trigram score if trigram in training set, else
                for the 2 bigrams in the trigram
                    return smoothed bigram score if bigram in training set, else
                    for the 2 unigrams in the bigram
                        return smoothed unigram score
                        
            get_score() shows the smoothed scoring    
            <n>gram_score() shows the backoff and smoothing factors    
        """

        def get_docs_with_terms(vectorizer, ngrams):
            docs = set()
            for term in ngrams:
                if term in vectorizer.vocab:
                    docs |= set(vectorizer.tfidf[term].keys())
            return docs
            
        def get_nearest(K, documents, vectorizer, ngrams, doc_ids):
            """Return (cls, doc id, distance) for the K documents nearest the query ngrams
            """
            # Compute scores and add to a priority queue
            scores = []
            for i in doc_ids:
                heapq.heappush(scores, (vectorizer.get_distance(i, ngrams), i, documents[i][0]))
            # Return top K scores
            return [(cls, i, dist) for dist, i, cls in heapq.nlargest(K, scores)]

        words = preprocessing.extract_words(message)
        if not words:
            return False, 0.0

        ngrams = dict((n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3))

        diffs = {}
        for n in (1, 2, 3):
            doc_ids = get_docs_with_terms(self.vectorizers[n], ngrams[n])
            nearest = get_nearest(KnnClassifier.K, self.documents[n], self.vectorizers[n], ngrams[n], doc_ids)

            pos = sum((1 if cls else -1) * (KnnClassifier.backoff ** k) for k, (cls, _, _) in enumerate(nearest))
            max_pos = sum(KnnClassifier.backoff ** k for k in range(len(nearest)))

            # pos2/max_pos2 is in range [-1,+1]
            pos2 = sum((1 if cls else -1) * (KnnClassifier.backoff2 ** (2 * k)) for k, (cls, _, _) in enumerate(nearest))
            max_pos2 = sum(KnnClassifier.backoff2 ** (2 * k) for k in range(len(nearest)))

            pos *= pos2
            max_pos *= max_pos2

            diffs[n] = pos / max_pos if max_pos else 0.0

        weights = KnnClassifier.get_weights()
        diff = sum(diffs[n] * weights[n] for n in (1, 2, 3))

        return diff > KnnClassifier.threshold, diff
Example #9
    def classify(self, message, detailed=False):
        """ 
            'message' is a string to classify. Return True or False classification.
            
            Method is to calculate a log_odds from a liklihood based on
            trigram, bigram and unigram (p,n) counts in the training set
            For each trigram
                return smoothed trigram score if trigram in training set, else
                for the 2 bigrams in the trigram
                    return smoothed bigram score if bigram in training set, else
                    for the 2 unigrams in the bigram
                        return smoothed unigram score
                        
            get_score() shows the smoothed scoring    
            <n>gram_score() shows the backoff and smoothing factors    
        """
        def get_docs_with_terms(vectorizer, ngrams):
            docs = set()
            for term in ngrams:
                if term in vectorizer.vocab:
                    docs |= set(vectorizer.tfidf[term].keys())
            return docs

        def get_nearest(K, documents, vectorizer, ngrams, doc_ids):
            """Return doc ids of K documents nearest query_vec
            """
            # Compute scores and add to a priority queue
            scores = []
            for i in doc_ids:
                heapq.heappush(
                    scores,
                    (vectorizer.get_distance(i, ngrams), i, documents[i][0]))
            # Return top K scores
            return [(cls, i, dist)
                    for dist, i, cls in heapq.nlargest(K, scores)]

        words = preprocessing.extract_words(message)
        if not words:
            return False, 0.0

        ngrams = dict(
            (n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3))

        diffs = {}
        for n in (1, 2, 3):
            doc_ids = get_docs_with_terms(self.vectorizers[n], ngrams[n])
            nearest = get_nearest(KnnClassifier.K, self.documents[n],
                                  self.vectorizers[n], ngrams[n], doc_ids)

            pos = sum((1 if cls else -1) * (KnnClassifier.backoff**k)
                      for k, (cls, _, _) in enumerate(nearest))
            max_pos = sum(KnnClassifier.backoff**k
                          for k in range(len(nearest)))

            # pos2/max_pos2 is in range [-1,+1]
            pos2 = sum((1 if cls else -1) * (KnnClassifier.backoff2**(2 * k))
                       for k, (cls, _, _) in enumerate(nearest))
            max_pos2 = sum(KnnClassifier.backoff2**(2 * k)
                           for k in range(len(nearest)))

            pos *= pos2
            max_pos *= max_pos2

            diffs[n] = pos / max_pos if max_pos else 0.0

        weights = KnnClassifier.get_weights()
        diff = sum(diffs[n] * weights[n] for n in (1, 2, 3))

        return diff > KnnClassifier.threshold, diff
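Taken together, the snippets assume the usual train-then-classify flow. The constructors, any public train() wrapper around _add_example(), and whatever post-training step builds the centroids or tf-idf vectorizers are not shown in the listing, so the driver below is only a sketch of how one of these classifiers might be exercised:

# Hypothetical driver; the labelled messages are made up for illustration.
training_data = [
    (True, "ouch, just got a nasty paper cut at work"),
    (False, "cutting paper for the school project tonight"),
]

classifier = BayesClassifier()             # or RocchioClassifier() / KnnClassifier()
for cls, message in training_data:
    classifier._add_example(cls, message)  # accumulate the per-class n-gram statistics

predicted, score = classifier.classify("another paper cut, send plasters")
print(predicted, score)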