def test_extract_words():

    text = "This movie, while not great, was decent"
    words = preprocessing.extract_words(text)

    assert_equals(['this', 'movie', 'while', 'not', 'not_great', 'was', 'decent'], words)

    words = preprocessing.extract_words(text, use_negation=False)
    assert_equals(['this', 'movie', 'while', 'not', 'great', 'was', 'decent'], words)
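
The test pins down what preprocessing.extract_words is expected to do: lower-case the text, strip punctuation, and (unless use_negation=False) prefix the word that follows a negation with 'not_'. A minimal sketch with that behaviour, assuming a simple regex tokenizer rather than the project's actual implementation:

import re

def extract_words_sketch(text, use_negation=True):
    # Lower-case and split on non-letter characters (assumed tokenization).
    words = [w for w in re.split(r"[^a-z']+", text.lower()) if w]
    if not use_negation:
        return words
    out, negate = [], False
    for w in words:
        # Prefix the word that follows 'not' with 'not_'.
        out.append("not_" + w if negate else w)
        negate = (w == "not")
    return out

# extract_words_sketch("This movie, while not great, was decent")
# -> ['this', 'movie', 'while', 'not', 'not_great', 'was', 'decent']
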
Example #3
    def _add_example(self, cls, message):
        """Add a training example
        """

        words = preprocessing.extract_words(message)
        if not words:  # Pre-processing removed all words; classify() treats such messages as negatives, so skip them here.
            return

        # Update self.documents[n] with (cls, ngrams) for each n-gram order
        for n in (1, 2, 3):
            ngrams = preprocessing.get_ngrams(n, words)
            self.documents[n].append((cls, ngrams))
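
The examples all call preprocessing.get_ngrams(n, words). Judging from the Bayes classifier below, which rebuilds trigrams with WORD_DELIMITER.join(words[i:i+3]), n-grams are delimiter-joined word windows; a plausible sketch (the delimiter value here is an assumption):

WORD_DELIMITER = " "  # assumed value; the real one is defined in the preprocessing module

def get_ngrams_sketch(n, words):
    # Return the n-grams of `words` as delimiter-joined strings.
    return [WORD_DELIMITER.join(words[i:i + n]) for i in range(len(words) - n + 1)]

# get_ngrams_sketch(2, ['this', 'movie', 'was', 'decent'])
# -> ['this movie', 'movie was', 'was decent']
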
Example #5
    def _add_example(self, cls, message):
        """Add a training example
        """

        words = preprocessing.extract_words(message)
        if not words:
            return

        documents = self.pos_documents if cls else self.neg_documents

        for n in (1, 2, 3):
            ngrams = preprocessing.get_ngrams(n, words)
            documents[n].append(ngrams)
            for g in ngrams:
                self.vocab[n].add(g)
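
This variant keeps per-class document lists and a per-n vocabulary. A plausible initialisation for those attributes, inferred from how they are used here (the constructor itself is not shown in the source):

    def __init__(self):
        # Assumed constructor: one document list and one vocabulary set per n-gram order.
        self.pos_documents = {n: [] for n in (1, 2, 3)}
        self.neg_documents = {n: [] for n in (1, 2, 3)}
        self.vocab = {n: set() for n in (1, 2, 3)}
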
    def _add_example(self, cls, message):
        """Add a training example
        """

        words = preprocessing.extract_words(message)
        if not words:  # Pre-processing removed all words; classify() treats such messages as negatives, so skip them here.
            return

        self.class_count[cls] += 1

        # Update ngram_counts[n] and ngram_keys[n] with this example's n-grams and class
        for n in (1, 2, 3):
            ngrams = preprocessing.get_ngrams(n, words)
            ngrams = BayesClassifier.get_features(ngrams)
            for g in ngrams:
                count = self.ngram_counts[n].get(g, [0, 0])
                count[cls] += 1
                self.ngram_counts[n][g] = count
                self.ngram_keys[n].add(g)
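
The Bayes classify() method further down reads self.cntv_ngrams[n] as a (cntn, cntp, v) tuple. A sketch of how it could be derived from the ngram_counts built here once training is finished (the method name is an assumption):

    def _compute_cntv(self):
        # cntv_ngrams[n] = (total negative count, total positive count, number of unique n-grams)
        self.cntv_ngrams = {}
        for n in (1, 2, 3):
            cntn = sum(neg for neg, pos in self.ngram_counts[n].values())
            cntp = sum(pos for neg, pos in self.ngram_counts[n].values())
            self.cntv_ngrams[n] = (cntn, cntp, len(self.ngram_counts[n]))
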
Example #8
    def classify(self, message, detailed=False):
        """ 
            'message' is a string to classify. Return True or False classification.
            
            Method is to calculate a log_odds from a liklihood based on
            trigram, bigram and unigram (p,n) counts in the training set
            For each trigram
                return smoothed trigram score if trigram in training set, else
                for the 2 bigrams in the trigram
                    return smoothed bigram score if bigram in training set, else
                    for the 2 unigrams in the bigram
                        return smoothed unigram score
                        
            get_score() shows the smoothed scoring    
            <n>gram_score() shows the backoff and smoothing factors    
        """

        words = preprocessing.extract_words(message)
        if not words:
            return False, 0.0

        # Build the unigram, bigram and trigram representations of the query
        ngrams = dict(
            (n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3))

        query_vecs = dict(
            (n, RocchioClassifier.get_query_vec(ngrams[n])) for n in (1, 2, 3))

        weights = RocchioClassifier.get_weights()

        def get_weighted_distance(centroid):
            return sum(
                RocchioClassifier.get_distance(centroid[n], query_vecs[n]) *
                weights[n] for n in (1, 2, 3))

        pos_distance = get_weighted_distance(self.pos_centroid)
        neg_distance = get_weighted_distance(self.neg_centroid)

        diff = (pos_distance + EPSILON) / (neg_distance + EPSILON)
        return diff > RocchioClassifier.threshold, math.log(diff)
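
classify() relies on RocchioClassifier helpers that are not shown here. Under the usual Rocchio formulation the class centroids are averaged training vectors and the comparison is cosine similarity; a hedged sketch of get_query_vec() and get_distance() along those lines (the actual implementations may differ):

import math
from collections import Counter

def get_query_vec_sketch(ngrams):
    # Term-frequency vector of the query n-grams (tf-idf weighting omitted for brevity).
    return Counter(ngrams)

def get_distance_sketch(centroid, query_vec):
    # Cosine similarity between a class centroid and the query vector (assumed measure).
    dot = sum(weight * centroid.get(gram, 0.0) for gram, weight in query_vec.items())
    norm_q = math.sqrt(sum(w * w for w in query_vec.values()))
    norm_c = math.sqrt(sum(w * w for w in centroid.values()))
    return dot / (norm_q * norm_c) if norm_q and norm_c else 0.0
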
# Note: this snippet assumes module-level `word_list` (the training vocabulary) and
# `pipeline` (a fitted scikit-learn estimator) defined elsewhere in the source project.
from collections import Counter

import numpy as np
from scipy.sparse import csr_matrix


def decide(text):
    dictionary = {}
    preprocessing.handle_file_text("demo", text, 3, dictionary)
    value_list = []
    row_list = []
    col_list = []
    row = 0

    for key in dictionary.keys():
        dictionary[key] = Counter(dictionary[key])
    lookup = {word: index for index, word in enumerate(word_list)}

    words = set(preprocessing.extract_words(text))
    for word in words:
        if word in lookup:
            value_list.append(dictionary['demo'][word])
            row_list.append(row)
            col_list.append(lookup[word])
    row += 1
    input_matrix = csr_matrix((value_list, (row_list, col_list)), shape=(1, len(word_list)), dtype=np.int8)

    return pipeline.predict(input_matrix)
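
decide() builds a single-row csr_matrix of word counts indexed by the module-level word_list and hands it to the fitted pipeline. A self-contained illustration of that mechanic with toy data (vocabulary, counts, labels and model choice are all assumptions):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB

word_list = ["decent", "great", "movie", "not_great"]   # assumed vocabulary
lookup = {word: index for index, word in enumerate(word_list)}

# Toy training matrix: one row per document, one column per vocabulary word.
X_train = csr_matrix(np.array([[1, 0, 1, 0],
                               [0, 0, 1, 1]], dtype=np.int8))
y_train = [True, False]                                  # assumed labels
pipeline = MultinomialNB().fit(X_train, y_train)

# Build the single query row the same way decide() does.
counts = {"movie": 1, "decent": 1}                       # stands in for dictionary['demo']
cols = [lookup[w] for w in counts if w in lookup]
vals = [counts[w] for w in counts if w in lookup]
query = csr_matrix((vals, ([0] * len(cols), cols)), shape=(1, len(word_list)), dtype=np.int8)
print(pipeline.predict(query))                           # e.g. [ True]
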
Example #12
def preprocessed(documents):
    for document in documents:
        document = clean(document)
        document = extract_words(document)
        yield document
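
preprocessed() is a generator, so documents are cleaned lazily; clean and extract_words come from the project's preprocessing module. Hypothetical usage:

raw_documents = ["This movie, while not great, was decent", "Great movie!"]
for words in preprocessed(raw_documents):
    print(words)  # the cleaned, tokenised word list for one document
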
Example #13
    def classify(self, message, detailed=False):
        """ 
            'message' is a string to classify. Return True or False classification.
            
            Method is to calculate a log_odds from a liklihood based on
            trigram, bigram and unigram (p,n) counts in the training set
            For each trigram
                return smoothed trigram score if trigram in training set, else
                for the 2 bigrams in the trigram
                    return smoothed bigram score if bigram in training set, else
                    for the 2 unigrams in the bigram
                        return smoothed unigram score
                        
            get_score() shows the smoothed scoring    
            <n>gram_score() shows the backoff and smoothing factors    
        """
        def get_docs_with_terms(vectorizer, ngrams):
            docs = set()
            for term in ngrams:
                if term in vectorizer.vocab:
                    docs |= set(vectorizer.tfidf[term].keys())
            return docs

        def get_nearest(K, documents, vectorizer, ngrams, doc_ids):
            """Return doc ids of K documents nearest query_vec
            """
            # Compute scores and add to a priority queue
            scores = []
            for i in doc_ids:
                heapq.heappush(
                    scores,
                    (vectorizer.get_distance(i, ngrams), i, documents[i][0]))
            # Return top K scores
            return [(cls, i, dist)
                    for dist, i, cls in heapq.nlargest(K, scores)]

        words = preprocessing.extract_words(message)
        if not words:
            return False, 0.0

        ngrams = dict(
            (n, preprocessing.get_ngrams(n, words)) for n in (1, 2, 3))

        diffs = {}
        for n in (1, 2, 3):
            doc_ids = get_docs_with_terms(self.vectorizers[n], ngrams[n])
            nearest = get_nearest(KnnClassifier.K, self.documents[n],
                                  self.vectorizers[n], ngrams[n], doc_ids)

            pos = sum((1 if cls else -1) * (KnnClassifier.backoff**k)
                      for k, (cls, _, _) in enumerate(nearest))
            max_pos = sum(KnnClassifier.backoff**k
                          for k in range(len(nearest)))

            # pos2/max_pos2 is in range [-1,+1]
            pos2 = sum((1 if cls else -1) * (KnnClassifier.backoff2**(2 * k))
                       for k, (cls, _, _) in enumerate(nearest))
            max_pos2 = sum(KnnClassifier.backoff2**(2 * k)
                           for k in range(len(nearest)))

            pos *= pos2
            max_pos *= max_pos2

            diffs[n] = pos / max_pos if max_pos else 0.0

        weights = KnnClassifier.get_weights()
        diff = sum(diffs[n] * weights[n] for n in (1, 2, 3))

        return diff > KnnClassifier.threshold, diff
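
The vote inside classify() weights the k-th nearest neighbour by backoff**k, so closer neighbours count more, and a second, faster-decaying vote is folded in multiplicatively. A small worked example with assumed constants (the real K, backoff and backoff2 are class attributes of KnnClassifier):

backoff, backoff2 = 0.8, 0.9          # assumed values
nearest = [True, True, False, True]   # classes of the 4 nearest neighbours, closest first

# First vote: each neighbour weighted by backoff**rank.
pos = sum((1 if cls else -1) * backoff ** k for k, cls in enumerate(nearest))
max_pos = sum(backoff ** k for k in range(len(nearest)))

# Second, faster-decaying vote, combined multiplicatively as in classify() above.
pos2 = sum((1 if cls else -1) * backoff2 ** (2 * k) for k, cls in enumerate(nearest))
max_pos2 = sum(backoff2 ** (2 * k) for k in range(len(nearest)))

diff = (pos * pos2) / (max_pos * max_pos2)   # normalised to [-1, +1]
print(round(diff, 3))                        # ~0.32 with these numbers
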
Example #14
    def classify(self, message, detailed=False):
        """message is a string to classify. Return True or False classification.
            
            Method is to calculate a log_odds from a liklihood based on
            trigram, bigram and unigram (pos,neg) counts in the training set
            For each trigram
                return smoothed trigram score if trigram in training set, else
                for the 2 bigrams in the trigram
                    return smoothed bigram score if bigram in training set, else
                    for the 2 unigrams in the bigram
                        return smoothed unigram score
                        
            get_score() shows the smoothed scoring    
            gram_score() shows the backoff and smoothing factors    
        """
        words = preprocessing.extract_words(message)
        if detailed:
            print(words)
        if not words:
            return False, 0.0

        # Using dicts with offset keys prevents the same ngram being included twice
        ngrams = dict((n, {}) for n in (1, 2, 3))

        from preprocessing import WORD_DELIMITER

        # Back off from trigrams to bigrams to unigrams, based on which n-grams were seen in training
        for i in range(len(words) - 3):
            tri = WORD_DELIMITER.join(words[i : i + 3])
            if tri in self.ngram_keys[3]:
                ngrams[3][i] = tri
            else:
                for j in (0, 1):
                    bi = WORD_DELIMITER.join(words[i + j : i + j + 2])
                    if bi in self.ngram_keys[2]:
                        ngrams[2][i + j] = bi
                    else:
                        for k in (0, 1):
                            ngrams[1][i + j + k] = words[i + j + k]

        for n in (1, 2, 3):
            ngrams[n] = BayesClassifier.get_features(ngrams[n].values())

        def get_score(counts, cntv, alpha):
            """Get a smoothed score for an ngram
                
                counts = neg,pos 
                    neg = number of negatives for ngram in training set                    
                    pos = number of positives for ngram in training set
                   
                cntv = (cntn,cntp,v) for an ngram training dict
                    cntn: total number of negatives
                    cntp: total number of positives
                    v: number of unique ngrams
                alpha: smoothing factor.  
                
                Returns: a smoothed score for the ngram         
            """
            neg, pos = counts
            if neg == pos:
                return 0
            cntn, cntp, v = cntv
            return math.log((pos + alpha) / (cntp + v * alpha)) - math.log((neg + alpha) / (cntn + v * alpha))

        if detailed:

            def _dbg(n, score, g):
                print("%d%s [%.2f] %s" % (n, "  " * (3 - n), score, g))

        else:

            def _dbg(n, score, g):
                pass

        weights = BayesClassifier.get_weights()
        smoothings = BayesClassifier.get_smoothings()

        def ngram_score(n, g):
            score = get_score(self.ngram_counts[n].get(g, [0, 0]), self.cntv_ngrams[n], smoothings[n])
            _dbg(n, score, g)
            return score

        neg, pos = self.class_count

        prior = math.log(pos) - math.log(neg)
        likelihood = sum(weights[n] * sum(ngram_score(n, g) for g in ngrams[n]) for n in (1, 2, 3))
        log_odds = prior + likelihood

        if detailed:
            n_gram_dict = {}
            for n in (1, 2, 3):
                for g in ngrams[n]:
                    n_gram_dict[g] = ngram_score(n, g) * weights[n]
            print "ngrams scores --------------"
            for k in sorted(n_gram_dict, key=lambda x: n_gram_dict[x]):
                print "%6.3f : %s " % (n_gram_dict[k], k)

        return log_odds > BayesClassifier.threshold, log_odds
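
For intuition, the smoothed score compares add-alpha estimates of P(ngram | positive) and P(ngram | negative). A quick numeric check of the same get_score() logic with made-up counts (alpha and the counts are illustrative only):

import math

def get_score(counts, cntv, alpha):
    # Same smoothing as in classify() above.
    neg, pos = counts
    if neg == pos:
        return 0
    cntn, cntp, v = cntv
    return math.log((pos + alpha) / (cntp + v * alpha)) - math.log((neg + alpha) / (cntn + v * alpha))

# An n-gram seen 8 times in positive and 2 times in negative training data, with
# 1000 n-gram tokens per class and a 5000-term vocabulary:
print(round(get_score([2, 8], (1000, 1000, 5000), 0.5), 3))  # -> 1.224, i.e. leans positive
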