Example no. 1
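 # Constructor for a tweet classifier: train the "related" and "awareness"
 # models from CSV training data, or load previously pickled ones.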
 def __init__(self,
              stop_words_file,
              related_training_data_file,
              awareness_training_data_file,
              needs_training,
              related_classifier_dump_file,
              awareness_classifier_dump_file,
              feature_list_file,
              classifier_type='nb'):
     self.helper = ClassifierHelper()
     self.stop_words = self.init_stop_words(stop_words_file)
     self.feature_list = []
     if needs_training:
         self.related_classifier = self.train_classifier(
             related_training_data_file, related_classifier_dump_file,
             feature_list_file, classifier_type)
         self.awareness_classifier = self.train_classifier(
             awareness_training_data_file, awareness_classifier_dump_file,
             feature_list_file, classifier_type)
     else:
         with open(related_classifier_dump_file, 'rb') as f:
             self.related_classifier = pickle.load(f)
         with open(awareness_classifier_dump_file, 'rb') as f:
             self.awareness_classifier = pickle.load(f)
         with open(feature_list_file, 'r') as f:
             for token in f:
                 self.feature_list.append(token.strip())
Example no. 2
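 # NaiveBayesClassifier constructor: wire the helper to its data files and
 # train (or reload) the pickled model.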
 def __init__(self, trainingDataFile, classifierDumpFile, datadir):
     # Instantiate classifier helper
     self.helper = ClassifierHelper('%s/%s' % (datadir, 'feature_list.txt'),
                                    '%s/%s' % (datadir, 'stop_words.txt'))
     self.trainingDataFile = trainingDataFile
     self.classifierPickled = classifierDumpFile
     self.last_trained = None
     self.classifier = self._getClassifier()

 def __init__(self):
     '''
     Set up the helper objects and resolve this module's absolute path.
     '''
     self.cl_helper = ClassifierHelper()
     self.wsd_helper = WSDHelper()
     self.ab_path = os.path.dirname(os.path.abspath(__file__))

 def __init__(self, classifier_names="MaxentClassifier", domains="tweets"):
     '''
     Set up the helpers, preload the requested classifiers, and unpickle
     the subjectivity classifier.
     '''
     self.cl_helper = ClassifierHelper()
     self.wsd_helper = WSDHelper()
     self.ab_path = os.path.dirname(os.path.abspath(__file__))
     self.loaded_classifiers = self.set_classifiers(classifier_names, domains)
     # Pickles are binary; open them in 'rb' mode.
     with open(self.ab_path + '/Data/Pickles/subjective/classifier-MaxentClassifier.rotten.pickle', 'rb') as f:
         self.subjective_classifier = pickle.load(f)
Example no. 6
# Imports assumed by this example; ClassifierHelper is provided elsewhere
# in the project and is not shown here.
import csv
import os
import pickle
from datetime import datetime

import nltk

class NaiveBayesClassifier:
    """ Naive Bayes Classifier """
    def __init__(self, trainingDataFile, classifierDumpFile, datadir):
        # Instantiate classifier helper
        self.helper = ClassifierHelper('%s/%s' % (datadir, 'feature_list.txt'),
                                       '%s/%s' % (datadir, 'stop_words.txt'))
        self.trainingDataFile = trainingDataFile
        self.classifierPickled = classifierDumpFile
        self.last_trained = None
        self.classifier = self._getClassifier()

    def _getClassifier(self, reload_existing=False):
        # Record when the classifier was (re)built.
        self.time = datetime.now()
        if reload_existing and os.path.exists(self.classifierPickled):
            # Reload the pickled classifier instead of retraining.
            with open(self.classifierPickled, 'rb') as f:
                self.classifier = pickle.load(f)
            return self.classifier
        return self._getNBTrainedClassifier(self.trainingDataFile,
                                            self.classifierPickled)

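    # Deduplicate each list in the data mapping, preserving element order.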
    def _getUniqData(self, data):
        uniq_data = {}
        for i in data:
            d = data[i]
            u = []
            for element in d:
                if element not in u:
                    u.append(element)
            # end inner loop
            uniq_data[i] = u
        # end outer loop
        return uniq_data

    # start getProcessedTweets
    def _getProcessedTweets(self, data):
        tweets = {}
        for i in data:
            d = data[i]
            tw = []
            for t in d:
                tw.append(self.helper.process_tweet(t))
            tweets[i] = tw
        # end loop
        return tweets

    def _getNBTrainedClassifier(self, trainingDataFile, classifierDumpFile):
        # Read all tweets and labels.
        tweets = self._getFilteredTrainingData(trainingDataFile)
        training_set = nltk.classify.apply_features(
            self.helper.extract_features, tweets)
        # Train, then pickle the classifier for later reuse.
        classifier = nltk.NaiveBayesClassifier.train(training_set)
        with open(classifierDumpFile, 'wb') as outfile:
            pickle.dump(classifier, outfile)
        return classifier

    def _getFilteredTrainingData(self, _file):
        # Rows look like: category,|tweet text|
        inpTweets = csv.reader(open(_file, 'rb'), delimiter=',', quotechar='|')
        tweets = []
        for row in inpTweets:
            if len(row) < 2:
                continue
            category = row[0]
            tweet = row[1]
            processedTweet = self.helper.process_tweet(tweet)
            featureVector = self.helper.getFeatureVector(processedTweet)
            tweets.append((featureVector, category))
        return tweets

    # classify words
    def classify(self, message):
        processedTestTweet = self.helper.process_tweet(message)
        classification = self.classifier.classify(
            self.helper.extract_features(
                self.helper.getFeatureVector(processedTestTweet)))
        return classification
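
A minimal usage sketch for NaiveBayesClassifier above, assuming hypothetical
file names and that ClassifierHelper and the data files exist:

nb = NaiveBayesClassifier(trainingDataFile='data/training.csv',
                          classifierDumpFile='data/nb_classifier.pickle',
                          datadir='data')
print nb.classify("I love this phone")  # label set comes from the training CSV, e.g. 'pos'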
# Imports assumed by this example; ClassifierHelper, WSDHelper and the
# SUBJECTIVE/OBJECTIVE/POSITIVE/NEGATIVE/HARD_TO_CLASSIFY constants are
# defined elsewhere in the project.
import os
import pickle
import re

class SentimentClassifier():
    '''
    Runs a sentence through one or more pickled NLTK classifiers and a
    WSD/SentiWordNet scorer, collecting per-classifier decisions.
    '''


    def __init__(self):
        '''
        Set up the helper objects and resolve this module's absolute path.
        '''
        self.cl_helper = ClassifierHelper()
        self.wsd_helper = WSDHelper()
        self.ab_path = os.path.dirname(os.path.abspath(__file__))
        
    def subjective_and_objective_classification(self, sentence):
        # Pickles are binary; open them in 'rb' mode.
        with open(self.ab_path + '/Data/Pickles/subjective/classifier-MaxentClassifier.rotten.pickle', 'rb') as f:
            classifier = pickle.load(f)
        tokens = self.cl_helper.bag_of_words(self.cl_helper.extract_words(sentence, is_stem=True))
        decision = classifier.classify(tokens)
        prob_dist = classifier.prob_classify(tokens)
        subj = prob_dist.prob('subjective')
        obj = prob_dist.prob('objective')
        print "Subjectivity = %s Objectivity = %s decision = %s" % (subj, obj, decision)
        if subj > obj:
            return SUBJECTIVE
        else:
            return OBJECTIVE
        
    def classify(self, sentence, classifier_names="MaxentClassifier", domain="tweets"):
        """Classify the sentence.

        Keyword arguments:
        classifier_names -- space-separated classifier names, e.g.
                            "MaxentClassifier" for a single classifier or
                            "MaxentClassifier NaiveBayes" for two
        domain -- domain of the training corpus

        :return: list of per-classifier result dicts
        """
        pos = neg = 0
        #NLTK Classifiers Starts Here ----------------------------------------->
        results = []
        req_classifiers = classifier_names.split()
        for classifier_name in req_classifiers:
            if classifier_name != "WSD-SentiWordNet":
                pickled_classifier = 'classifier-%s.%s.pickle' % (classifier_name, domain)
                pickle_path = self.ab_path + '/' + 'Data/Pickles/%s/%s' % (domain, pickled_classifier)
                if not os.path.exists(pickle_path):
                    continue
                with open(pickle_path, 'rb') as f:
                    classifier = pickle.load(f)
                tokens = self.cl_helper.bag_of_words(self.cl_helper.extract_words(sentence))
                prob_dist = classifier.prob_classify(tokens)
                neg = prob_dist.prob('neg')
                pos = prob_dist.prob('pos')
                decision = self.prepare_results(pos, neg, classifier_name, domain)
                results.append({
                                "classifier"    : classifier_name,
                                "result"        : decision,
                                "pos_score"     : pos,
                                "neg_score"     : neg
                               })
                
            #WSD Hue Starts Here ----------------------------------------->
            if classifier_name == "WSD-SentiWordNet":
                r = re.compile("[,.?()\\d]+ *")
                lines_list = r.split(sentence)
                pos, neg = self.wsd_helper.call_classifier(lines_list)
                normalize_wsd = pos + neg + 1
                pos = pos/normalize_wsd
                neg = neg/normalize_wsd
            
                print "Results from WSD SentiWordNet on %s Corpus" % domain
                decision = self.prepare_results(pos, neg, classifier_name, domain)
                results.append({
                                "classifier"    : classifier_name,
                                "result"        : decision,
                                "pos_score"     : pos,
                                "neg_score"     : neg
                               })
            #WSD Hue ENDS Here ----------------------------------------->
        return results
            
    def prepare_results(self, pos, neg, classifier_name, domain):
        print "Results from %s on %s Corpus" % (classifier_name, domain)
        if abs(pos - neg) <= 0.15 and neg != 0 and pos != 0:
            print "Text is Neutral/Hard To Classify"
            print 'Positive = %s , Negative = %s' % (pos, neg)
            return HARD_TO_CLASSIFY    
        elif pos > neg:
            print " Text is POSITIVE"
            print 'Positive = %s Negative = %s' % (pos, neg)
            return POSITIVE
        else:
            print " Text is NEGATIVE"
            print 'Positive = %s Negative = %s' % (pos, neg)
            return NEGATIVE
Example no. 8
# Imports assumed by this example; ClassifierHelper is provided elsewhere
# in the project.
import codecs
import csv
import pickle
import re

import nltk

class MaxEntClassifier:
    def extract_features(self, document):
        document_words = set(document)
        features = {}
        for word in self.feature_list:
            features['contains(%s)' % word] = (word in document_words)
        return features

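    # Build unigram and bigram features, dropping stop words and tokens that
    # are not purely alphanumeric.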
    def get_feature_vector(self, tweet):
        words = tweet.split()
        features = []
        for word in words:
            word = word.strip('\'"?!,.')
            valid = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", word)
            if word in self.stop_words or valid is None:
                continue
            else:
                features.append(word.lower())
        for gram in nltk.bigrams(words):
            x, y = gram
            valid_x = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", x)
            valid_y = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", y)
            if x in self.stop_words or y in self.stop_words or valid_x is None or valid_y is None:
                continue
            else:
                features.append(gram[0] + " " + gram[1])
        return features

    def __init__(self,
                 stop_words_file,
                 related_training_data_file,
                 awareness_training_data_file,
                 needs_training,
                 related_classifier_dump_file,
                 awareness_classifier_dump_file,
                 feature_list_file,
                 classifier_type='nb'):
        self.helper = ClassifierHelper()
        self.stop_words = self.init_stop_words(stop_words_file)
        self.feature_list = []
        if needs_training:
            self.related_classifier = self.train_classifier(
                related_training_data_file, related_classifier_dump_file,
                feature_list_file, classifier_type)
            self.awareness_classifier = self.train_classifier(
                awareness_training_data_file, awareness_classifier_dump_file,
                feature_list_file, classifier_type)
        else:
            with open(related_classifier_dump_file, 'rb') as f:
                self.related_classifier = pickle.load(f)
            with open(awareness_classifier_dump_file, 'rb') as f:
                self.awareness_classifier = pickle.load(f)
            with open(feature_list_file, 'r') as f:
                for token in f:
                    self.feature_list.append(token.strip())

    def classify_awareness(self, tweet):
        processed_tweet = self.helper.process_tweet(tweet)
        return self.awareness_classifier.classify(
            self.extract_features(self.get_feature_vector(processed_tweet)))

    def classify_related(self, tweet):
        processed_tweet = self.helper.process_tweet(tweet)
        return self.related_classifier.classify(
            self.extract_features(self.get_feature_vector(processed_tweet)))

    def show_informative_features(self, n):
        return self.related_classifier.show_most_informative_features(
            n, show='pos'
        ), self.awareness_classifier.show_most_informative_features(n)

    def train_classifier(self, training_data_file, classifier_dump_file,
                         feature_list_file, classifier_type):
        training_data = csv.reader(codecs.open(training_data_file,
                                               'r',
                                               encoding='UTF-8'),
                                   delimiter=',',
                                   quotechar='|')
        tweets = []
        for row in training_data:
            if len(row) < 2:
                continue
            sentiment = row[0]
            tweet = row[1]
            processed_tweet = self.helper.process_tweet(tweet)
            feature_vector = self.get_feature_vector(processed_tweet)
            self.feature_list.extend(feature_vector)
            tweets.append((feature_vector, sentiment))
        self.feature_list = list(set(self.feature_list))
        training_set = nltk.apply_features(self.extract_features, tweets)

        if classifier_type == 'nb':
            out_classifier = nltk.classify.NaiveBayesClassifier.train(
                training_set)
        elif classifier_type == 'maxent':
            out_classifier = nltk.classify.maxent.MaxentClassifier.train(
                training_set,
                'GIS',
                trace=3,
                labels=None,
                gaussian_prior_sigma=0,
                max_iter=10)
        else:
            raise ValueError('unknown classifier_type: %r' % classifier_type)
        # Pickle the trained classifier for later reuse.
        with open(classifier_dump_file, 'wb') as f:
            pickle.dump(out_classifier, f)

        with open(feature_list_file, 'w') as f:
            for token in self.feature_list:
                f.write(token + '\n')
        return out_classifier

    def init_stop_words(self, stop_words_file):
        # AT_USER and URL are the placeholder tokens the preprocessor emits
        # for mentions and links.
        stop_words = ['AT_USER', 'URL']
        with open(stop_words_file, 'r') as f:
            for word in f:
                stop_words.append(word.strip())
        return stop_words
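
A minimal usage sketch for MaxEntClassifier; every file name below is a
placeholder, and the training CSVs use ',' as delimiter and '|' as quotechar:

clf = MaxEntClassifier(stop_words_file='data/stop_words.txt',
                       related_training_data_file='data/related.csv',
                       awareness_training_data_file='data/awareness.csv',
                       needs_training=True,
                       related_classifier_dump_file='data/related.pickle',
                       awareness_classifier_dump_file='data/awareness.pickle',
                       feature_list_file='data/feature_list.txt',
                       classifier_type='maxent')
print clf.classify_related("flood warning issued for the river valley")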
# Imports assumed by this example; ClassifierHelper, WSDHelper and the
# project constants (SUBJECTIVE, OBJECTIVE, POSITIVE, NEGATIVE,
# HARD_TO_CLASSIFY, HARD_THRESHOLD) are defined elsewhere.
import os
import pickle
import re

from textblob import TextBlob

class SentimentClassifier():
    '''
    Extended variant: preloads and caches the requested classifiers and adds
    a TextBlob-based subjectivity/polarity check.
    '''


    def __init__(self, classifier_names="MaxentClassifier", domains="tweets"):
        '''
        Set up the helpers, preload the requested classifiers, and unpickle
        the subjectivity classifier.
        '''
        self.cl_helper = ClassifierHelper()
        self.wsd_helper = WSDHelper()
        self.ab_path = os.path.dirname(os.path.abspath(__file__))
        self.loaded_classifiers = self.set_classifiers(classifier_names, domains)
        # Pickles are binary; open them in 'rb' mode.
        with open(self.ab_path + '/Data/Pickles/subjective/classifier-MaxentClassifier.rotten.pickle', 'rb') as f:
            self.subjective_classifier = pickle.load(f)
        
    def subjective_and_objective_classification(self, sentence):
        tokens = self.cl_helper.bag_of_words(self.cl_helper.extract_words(sentence, is_stem=True))
        decision = self.subjective_classifier.classify(tokens)
        prob_dist = self.subjective_classifier.prob_classify(tokens)
        subj = prob_dist.prob('subjective')
        obj = prob_dist.prob('objective')
        print "Subjectivity = %s Objectivity = %s decision = %s" % (subj, obj, decision)
        if subj > obj:
            return SUBJECTIVE
        else:
            return OBJECTIVE
        
    def textblob_results(self, sentence):
        testimonial = TextBlob(sentence)
        if testimonial.subjectivity > 0.3:
            subjectivity = SUBJECTIVE
        else:
            subjectivity = OBJECTIVE

        print "Subjectivity from TextBlob %s" % testimonial.subjectivity
        textblob_decision = self.prepare_textblob_results(testimonial.polarity)
        return (subjectivity, textblob_decision)
        
    def classify(self, sentence, classifier_names="MaxentClassifier", domain="tweets"):
        """Classify the sentence.

        Keyword arguments:
        classifier_names -- space-separated classifier names, e.g.
                            "MaxentClassifier" for a single classifier or
                            "MaxentClassifier NaiveBayes" for two
        domain -- domain of the training corpus

        :return: list of per-classifier result dicts
        """
        pos = neg = 0
        #NLTK Classifiers Starts Here ----------------------------------------->
        results = []
        req_classifiers = classifier_names.split()
        for classifier_name in req_classifiers:
            if classifier_name != "WSD-SentiWordNet":
                key = classifier_name + "_" + domain
                try:
                    classifier = self.loaded_classifiers[key]
                except KeyError:
                    # Not preloaded; try to load it on demand.
                    classifier = self.load_classifiers(classifier_name, domain)
                if classifier is None:
                    continue
                tokens = self.cl_helper.bag_of_words(self.cl_helper.extract_words(sentence))
                prob_dist = classifier.prob_classify(tokens)
                neg = prob_dist.prob('neg')
                pos = prob_dist.prob('pos')
                decision = self.prepare_results(pos, neg, classifier_name, domain)
                results.append({
                                "classifier"    : classifier_name,
                                "result"        : decision,
                                "pos_score"     : pos,
                                "neg_score"     : neg
                               })
                
            #WSD Hue Starts Here ----------------------------------------->
            if classifier_name == "WSD-SentiWordNet":
                r = re.compile("[,.?()\\d]+ *")
                lines_list = r.split(sentence)
                pos, neg = self.wsd_helper.call_classifier(lines_list)
                normalize_wsd = pos + neg + 1
                pos = pos/normalize_wsd
                neg = neg/normalize_wsd
                decision = self.prepare_results(pos, neg, classifier_name, domain)
                results.append({
                                "classifier"    : classifier_name,
                                "result"        : decision,
                                "pos_score"     : pos,
                                "neg_score"     : neg
                               })
            #WSD Hue ENDS Here ----------------------------------------->
        return results
            
    def prepare_results(self, pos, neg, classifier_name, domain):
        print "Results from %s on %s Corpus" % (classifier_name, domain)
        if abs(pos - neg) <= HARD_THRESHOLD:
            print " Text is Neutral/Hard To Classify"
            print ' Positive = %s , Negative = %s' % (pos, neg)
            return HARD_TO_CLASSIFY    
        elif pos > neg:
            print " Text is POSITIVE"
            print ' Positive = %s Negative = %s' % (pos, neg)
            return POSITIVE
        elif pos < neg:
            print " Text is NEGATIVE"
            print ' Positive = %s Negative = %s' % (pos, neg)
            return NEGATIVE
        else:
            return HARD_TO_CLASSIFY 
    
    def prepare_textblob_results(self, textblob_polarity):
        print "Results from TextBlob pattern" 
        polarity = textblob_polarity
        print ' Polarity %s' % polarity
        if -0.25 <= polarity <= 0.25:
            print " Text is Neutral/Hard To Classify"
            return HARD_TO_CLASSIFY   
        elif -0.25 > polarity:
            print " Text is NEGATIVE"
            return NEGATIVE
        elif polarity > 0.25:
            print " Text is POSITIVE"
            return POSITIVE
        else:
            return HARD_TO_CLASSIFY 
            
        
    def load_classifiers(self, classifier_name="MaxentClassifier", domain_name="tweets"):
        pickled_classifier = 'classifier-%s.%s.pickle' % (classifier_name, domain_name)
        pickle_path = self.ab_path + '/' + 'Data/Pickles/%s/%s' % (domain_name, pickled_classifier)
        if not os.path.exists(pickle_path):
            return None
        with open(pickle_path, 'rb') as f:
            classifier = pickle.load(f)
        return classifier
                
    def set_classifiers(self, classifier_names="MaxentClassifier", domains="tweets"):
        print "Loading classifiers..."
        req_classifiers = classifier_names.split()
        req_domains = domains.split()
        loaded_classifiers = {}
        for classifier_name in req_classifiers:
            # WSD-SentiWordNet is not pickle-backed, so skip it here.
            if classifier_name == "WSD-SentiWordNet":
                continue

            for domain_name in req_domains:
                classifier = self.load_classifiers(classifier_name, domain_name)
                if classifier is None:
                    continue
                classifier_key_name = classifier_name + "_" + domain_name
                loaded_classifiers[classifier_key_name] = classifier
                print "Classifier %s loaded!..." % classifier_key_name

        return loaded_classifiers
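
A minimal usage sketch for the extended SentimentClassifier, assuming the
Data/Pickles tree sits next to this module and contains the needed pickles:

sc = SentimentClassifier(classifier_names="MaxentClassifier NaiveBayes",
                         domains="tweets")
for result in sc.classify("what a wonderful day", domain="tweets"):
    print result["classifier"], result["result"], result["pos_score"], result["neg_score"]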