def __init__(self, stop_words_file, related_training_data_file,
             awareness_training_data_file, needs_training,
             related_classifier_dump_file, awareness_classifier_dump_file,
             feature_list_file, classifier_type='nb'):
    """Prepare the "related" and "awareness" classifiers.

    With a truthy ``needs_training`` both classifiers are produced by
    ``train_classifier`` (related first, then awareness). Otherwise both
    classifiers are unpickled from their dump files and ``feature_list`` is
    filled from ``feature_list_file``, one whitespace-stripped line per entry.
    """
    self.helper = ClassifierHelper()
    self.stop_words = self.init_stop_words(stop_words_file)
    self.feature_list = []

    if needs_training:
        # Train from scratch; the related classifier is always built first.
        self.related_classifier = self.train_classifier(
            related_training_data_file,
            related_classifier_dump_file,
            feature_list_file,
            classifier_type,
        )
        self.awareness_classifier = self.train_classifier(
            awareness_training_data_file,
            awareness_classifier_dump_file,
            feature_list_file,
            classifier_type,
        )
        return

    # Restore previously trained state from disk.
    with open(related_classifier_dump_file, 'rb') as related_dump:
        self.related_classifier = pickle.load(related_dump)
    with open(awareness_classifier_dump_file, 'rb') as awareness_dump:
        self.awareness_classifier = pickle.load(awareness_dump)
    with open(feature_list_file, 'r') as feature_file:
        self.feature_list.extend(line.strip() for line in feature_file)
def __init__(self, trainingDataFile, classifierDumpFile, datadir):
    """Store the file locations and obtain the classifier.

    The helper is built from ``feature_list.txt`` and ``stop_words.txt``
    inside ``datadir``; the classifier itself comes from
    ``self._getClassifier()``.
    """
    feature_list_path = '%s/%s' % (datadir, 'feature_list.txt')
    stop_words_path = '%s/%s' % (datadir, 'stop_words.txt')

    # Instantiate classifier helper
    self.helper = ClassifierHelper(feature_list_path, stop_words_path)

    self.trainingDataFile = trainingDataFile
    self.classifierPickled = classifierDumpFile
    self.last_trained = None
    self.classifier = self._getClassifier()
def __init__(self):
    '''
    Create the classifier and WSD helpers and remember the directory
    that contains this module.
    '''
    self.cl_helper = ClassifierHelper()
    self.wsd_helper = WSDHelper()

    # Absolute directory of this source file.
    module_file = os.path.abspath(__file__)
    self.ab_path = os.path.dirname(module_file)
def __init__(self, classifier_names="MaxentClassifier", domains="tweets"):
    '''
    Create the helpers and preload the requested classifiers.

    classifier_names -- forwarded to ``set_classifiers``
    domains          -- forwarded to ``set_classifiers``

    The result of ``set_classifiers`` is kept in ``loaded_classifiers`` and
    the subjectivity classifier is unpickled from the ``Data/Pickles``
    directory next to this module.
    '''
    self.cl_helper = ClassifierHelper()
    self.wsd_helper = WSDHelper()
    self.ab_path = os.path.dirname(os.path.abspath(__file__))
    self.loaded_classifiers = self.set_classifiers(classifier_names, domains)

    subjective_pickle = (
        self.ab_path
        + '/Data/Pickles/subjective/classifier-MaxentClassifier.rotten.pickle'
    )
    # The context manager closes the handle; the original left it open.
    # NOTE(review): mode 'r' is kept from the original. Binary pickles need
    # 'rb' -- confirm how this pickle was written before changing it.
    # NOTE(review): pickle.load executes arbitrary code; only load trusted files.
    with open(subjective_pickle, 'r') as pickle_file:
        self.subjective_classifier = pickle.load(pickle_file)
def __init__(self, stop_words_file, related_training_data_file,
             awareness_training_data_file, needs_training,
             related_classifier_dump_file, awareness_classifier_dump_file,
             feature_list_file, classifier_type='nb'):
    """Initialise helper, stop words, feature list and both classifiers.

    If ``needs_training`` is falsy the two classifiers are unpickled from
    their dump files and the feature list is read from
    ``feature_list_file`` (each line stripped). Otherwise both classifiers
    are trained through ``train_classifier``, related before awareness.
    """
    self.helper = ClassifierHelper()
    self.stop_words = self.init_stop_words(stop_words_file)
    self.feature_list = []

    if not needs_training:
        with open(related_classifier_dump_file, 'rb') as dump:
            self.related_classifier = pickle.load(dump)
        with open(awareness_classifier_dump_file, 'rb') as dump:
            self.awareness_classifier = pickle.load(dump)
        with open(feature_list_file, 'r') as tokens:
            for raw_token in tokens:
                self.feature_list.append(raw_token.strip())
    else:
        related = self.train_classifier(
            related_training_data_file, related_classifier_dump_file,
            feature_list_file, classifier_type)
        self.related_classifier = related
        awareness = self.train_classifier(
            awareness_training_data_file, awareness_classifier_dump_file,
            feature_list_file, classifier_type)
        self.awareness_classifier = awareness
class NaiveBayesClassifier:
    """Naive Bayes tweet classifier built on NLTK.

    Training rows are read from a CSV file (category in column 0, tweet text
    in column 1); the trained classifier is pickled to ``classifierDumpFile``.
    """

    def __init__(self, trainingDataFile, classifierDumpFile, datadir):
        """Store file locations, build the helper and obtain a classifier.

        ``datadir`` must contain ``feature_list.txt`` and ``stop_words.txt``,
        which are handed to ``ClassifierHelper``.
        """
        # Instantiate classifier helper
        self.helper = ClassifierHelper('%s/%s' % (datadir, 'feature_list.txt'),
                                       '%s/%s' % (datadir, 'stop_words.txt'))

        self.trainingDataFile = trainingDataFile
        self.classifierPickled = classifierDumpFile
        self.last_trained = None

        self.classifier = self._getClassifier()

    def _getClassifier(self, reload_existing=False):
        """Return a classifier, reloading the pickled one when requested.

        With ``reload_existing`` true and the pickle present, the classifier
        is unpickled, stored on ``self.classifier`` and returned. In every
        other case a new classifier is trained (and pickled) and returned.
        """
        import os.path

        # Record time.
        self.time = datetime.now()

        if reload_existing and os.path.exists(self.classifierPickled):
            # Read in binary mode to match the 'wb' used when dumping.
            # NOTE(review): pickle.load executes arbitrary code; only load
            # trusted files.
            with open(self.classifierPickled, 'rb') as pickled:
                self.classifier = pickle.load(pickled)
            # The original returned None here, so callers that assign the
            # result (as __init__ does) would have lost the classifier.
            return self.classifier

        return self._getNBTrainedClassifer(self.trainingDataFile,
                                           self.classifierPickled)

    def _getUniqData(self, data):
        """Return a copy of ``data`` with duplicates dropped from each value.

        First-occurrence order is kept. Equality (not hashing) is used so
        unhashable elements keep working.
        """
        uniq_data = {}
        for key in data:
            unique = []
            for element in data[key]:
                if element not in unique:
                    unique.append(element)
            uniq_data[key] = unique
        return uniq_data

    def _getProcessedTweets(self, data):
        """Map every tweet in each value of ``data`` through
        ``helper.process_tweet``."""
        tweets = {}
        for key in data:
            tweets[key] = [self.helper.process_tweet(t) for t in data[key]]
        return tweets

    def _getNBTrainedClassifer(self, trainingDataFile, classifierDumpFile):
        """Train an NLTK Naive Bayes classifier and pickle it.

        Returns the trained classifier.
        """
        # read all tweets and labels
        tweets = self._getFilteredTrainingData(trainingDataFile)

        training_set = nltk.classify.apply_features(
            self.helper.extract_features, tweets)

        classifier = nltk.NaiveBayesClassifier.train(training_set)

        # Write the classifier back to disk; 'with' closes the file on error too.
        with open(classifierDumpFile, 'wb') as outfile:
            pickle.dump(classifier, outfile)
        return classifier

    def _getFilteredTrainingData(self, _file):
        """Read ``_file`` and return ``(feature_vector, category)`` tuples.

        Rows with fewer than two columns are skipped.
        """
        tweets = []
        # NOTE(review): 'rb' is what the csv module expects on Python 2;
        # on Python 3 this would need text mode -- confirm the target version.
        with open(_file, 'rb') as handle:
            for row in csv.reader(handle, delimiter=',', quotechar='|'):
                if len(row) < 2:
                    continue
                category = row[0]
                processedTweet = self.helper.process_tweet(row[1])
                featureVector = self.helper.getFeatureVector(processedTweet)
                tweets.append((featureVector, category))
        return tweets

    # classify words
    def classify(self, message):
        """Return the label the classifier assigns to ``message``."""
        processedTestTweet = self.helper.process_tweet(message)
        featureVector = self.helper.getFeatureVector(processedTestTweet)
        return self.classifier.classify(
            self.helper.extract_features(featureVector))
class SentimentClassifier():
    '''
    Sentiment and subjectivity classification using pickled NLTK
    classifiers and a WSD/SentiWordNet helper.
    '''

    def __init__(self):
        '''
        Create the helpers and remember the directory of this module.
        '''
        self.cl_helper = ClassifierHelper()
        self.wsd_helper = WSDHelper()
        self.ab_path = os.path.dirname(os.path.abspath(__file__))

    def _load_pickle(self, path):
        '''Unpickle and return the object stored at ``path``.'''
        # The context manager closes the handle; the original left it open.
        # NOTE(review): mode 'r' is kept from the original. Binary pickles
        # need 'rb' -- confirm how the pickles were written before changing.
        # NOTE(review): pickle.load executes arbitrary code; only load
        # trusted files.
        with open(path, 'r') as pickle_file:
            return pickle.load(pickle_file)

    def subjective_and_objective_classification(self, sentence):
        '''
        Return SUBJECTIVE when the "subjective" probability is strictly
        greater than the "objective" one, otherwise OBJECTIVE.
        '''
        classifier = self._load_pickle(
            self.ab_path
            + '/Data/Pickles/subjective/classifier-MaxentClassifier.rotten.pickle')
        tokens = self.cl_helper.bag_of_words(
            self.cl_helper.extract_words(sentence, is_stem=True))
        decision = classifier.classify(tokens)
        # Compute the distribution once and read both labels from it.
        distribution = classifier.prob_classify(tokens)
        subj = distribution.prob('subjective')
        obj = distribution.prob('objective')
        print("Subjectivity = %s Objectivity = %s decision = %s" % (subj, obj, decision))
        if subj > obj:
            return SUBJECTIVE
        return OBJECTIVE

    def classify(self, sentence, classifier_names="MaxentClassifier", domain="tweets"):
        """Classify the sentence with every requested classifier.

        Keyword arguments:
        classifier_names -- classifier names as one space separated string,
                            e.g. "MaxentClassifier" or
                            "MaxentClassifier NaiveBayes"; the special name
                            "WSD-SentiWordNet" selects the WSD helper
        domain -- domain of the train corpus

        Returns a list with one dict per classifier that could be run, with
        the keys "classifier", "result", "pos_score" and "neg_score".
        Classifiers whose pickle file does not exist are skipped.
        """
        results = []
        for classifier_name in classifier_names.split():
            if classifier_name == "WSD-SentiWordNet":
                # WSD Hue
                r = re.compile("[,.?()\\d]+ *")
                lines_list = r.split(sentence)
                pos, neg = self.wsd_helper.call_classifier(lines_list)
                # float() keeps the division from truncating on Python 2
                # should the helper return integer scores.
                normalize_wsd = float(pos + neg + 1)
                pos = pos / normalize_wsd
                neg = neg / normalize_wsd
                print("Results from WSD SentiWordNet on %s Corpus " % domain)
            else:
                # NLTK classifiers
                pickled_classifier = 'classifier-%s.%s.pickle' % (classifier_name, domain)
                pickle_path = self.ab_path + '/' + 'Data/Pickles/%s/%s' % (domain, pickled_classifier)
                if not os.path.exists(pickle_path):
                    continue
                classifier = self._load_pickle(pickle_path)
                tokens = self.cl_helper.bag_of_words(
                    self.cl_helper.extract_words(sentence))
                distribution = classifier.prob_classify(tokens)
                neg = distribution.prob('neg')
                pos = distribution.prob('pos')

            decision = self.prepare_results(pos, neg, classifier_name, domain)
            results.append({
                "classifier": classifier_name,
                "result": decision,
                "pos_score": pos,
                "neg_score": neg,
            })
        return results

    def prepare_results(self, pos, neg, classifier_name, domain):
        '''
        Print the scores and turn them into a decision.

        HARD_TO_CLASSIFY is returned when both scores are non-zero and differ
        by at most 0.15; otherwise POSITIVE when ``pos > neg``, else NEGATIVE.
        '''
        print("Results from %s on %s Corpus" % (classifier_name, domain))
        if abs(pos - neg) <= 0.15 and neg != 0 and pos != 0:
            print("Text is Neutral/Hard To Classify")
            print('Positive = %s , Negative = %s' % (pos, neg))
            return HARD_TO_CLASSIFY
        elif pos > neg:
            print(" Text is POSITIVE")
            print('Positive = %s Negative = %s' % (pos, neg))
            return POSITIVE
        else:
            print(" Text is NEGATIVE")
            print('Positive = %s Negative = %s' % (pos, neg))
            return NEGATIVE
class MaxEntClassifier:
    """Pair of tweet classifiers ("related" and "awareness") sharing one
    feature list and one stop-word list.

    Despite the class name, the underlying NLTK model is selected by
    ``classifier_type``: ``'nb'`` (Naive Bayes) or ``'maxent'``.
    """

    def extract_features(self, document):
        """Return a ``{'contains(<feature>)': bool}`` dict covering every
        entry of ``self.feature_list``."""
        document_words = set(document)
        return {
            'contains(%s)' % word: (word in document_words)
            for word in self.feature_list
        }

    def get_feature_vector(self, tweet):
        """Return the unigram and bigram features of ``tweet``.

        A token qualifies when it is not a stop word and consists of a
        letter followed by letters or digits only.
        """
        token_pattern = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")
        words = tweet.split()
        features = []

        for word in words:
            word = word.strip('\'"?!,.')
            if word in self.stop_words or token_pattern.search(word) is None:
                continue
            features.append(word.lower())

        # NOTE(review): bigrams are built from the raw tokens (punctuation
        # not stripped, case kept), unlike the unigrams above. Left as is
        # because already-trained classifiers depend on these feature names.
        for first, second in nltk.bigrams(words):
            if first in self.stop_words or second in self.stop_words:
                continue
            if token_pattern.search(first) is None or token_pattern.search(second) is None:
                continue
            features.append(first + " " + second)
        return features

    def __init__(self, stop_words_file, related_training_data_file,
                 awareness_training_data_file, needs_training,
                 related_classifier_dump_file, awareness_classifier_dump_file,
                 feature_list_file, classifier_type='nb'):
        """Train both classifiers, or restore them from disk.

        With a truthy ``needs_training`` both classifiers are trained
        (related first). Otherwise they are unpickled from their dump files
        and the feature list is read from ``feature_list_file``.
        """
        self.helper = ClassifierHelper()
        self.stop_words = self.init_stop_words(stop_words_file)
        self.feature_list = []

        if needs_training:
            self.related_classifier = self.train_classifier(
                related_training_data_file, related_classifier_dump_file,
                feature_list_file, classifier_type)
            self.awareness_classifier = self.train_classifier(
                awareness_training_data_file, awareness_classifier_dump_file,
                feature_list_file, classifier_type)
        else:
            # NOTE(review): pickle.load executes arbitrary code; only load
            # trusted files.
            with open(related_classifier_dump_file, 'rb') as f:
                self.related_classifier = pickle.load(f)
            with open(awareness_classifier_dump_file, 'rb') as f:
                self.awareness_classifier = pickle.load(f)
            with open(feature_list_file, 'r') as f:
                self.feature_list.extend(token.strip() for token in f)

    def _features_for(self, tweet):
        """Run ``tweet`` through the helper and return its feature dict."""
        processed_tweet = self.helper.process_tweet(tweet)
        return self.extract_features(self.get_feature_vector(processed_tweet))

    def classify_awareness(self, tweet):
        """Return the awareness classifier's label for ``tweet``."""
        return self.awareness_classifier.classify(self._features_for(tweet))

    def classify_related(self, tweet):
        """Return the related classifier's label for ``tweet``."""
        return self.related_classifier.classify(self._features_for(tweet))

    def show_informative_features(self, n):
        """Call ``show_most_informative_features`` on both classifiers
        (related first) and return the two results as a tuple."""
        return self.related_classifier.show_most_informative_features(
            n, show='pos'
        ), self.awareness_classifier.show_most_informative_features(n)

    def train_classifier(self, training_data_file, classifier_dump_file,
                         feature_list_file, classifier_type):
        """Train a classifier from a CSV file, persist it and return it.

        The CSV uses ``,`` as delimiter and ``|`` as quote character, with
        the label in column 0 and the tweet in column 1; rows with fewer
        than two columns are skipped. The classifier is pickled to
        ``classifier_dump_file`` and the accumulated feature list is written
        to ``feature_list_file``.

        Raises ValueError when ``classifier_type`` is neither ``'nb'`` nor
        ``'maxent'``.
        """
        # Fail before touching any file; the original only hit an
        # UnboundLocalError after it had already rewritten the feature list.
        if classifier_type not in ('nb', 'maxent'):
            raise ValueError(
                "classifier_type must be 'nb' or 'maxent', got %r"
                % (classifier_type,))

        tweets = []
        with codecs.open(training_data_file, 'r', encoding='UTF-8') as source:
            for row in csv.reader(source, delimiter=',', quotechar='|'):
                if len(row) < 2:
                    # Blank or malformed line.
                    continue
                sentiment = row[0]
                processed_tweet = self.helper.process_tweet(row[1])
                feature_vector = self.get_feature_vector(processed_tweet)
                self.feature_list.extend(feature_vector)
                tweets.append((feature_vector, sentiment))

        # De-duplicate; features from earlier training runs are retained.
        self.feature_list = list(set(self.feature_list))
        training_set = nltk.apply_features(self.extract_features, tweets)

        if classifier_type == 'nb':
            out_classifier = nltk.classify.NaiveBayesClassifier.train(
                training_set)
        else:
            out_classifier = nltk.classify.maxent.MaxentClassifier.train(
                training_set, 'GIS', trace=3, labels=None,
                gaussian_prior_sigma=0, max_iter=10)

        with open(classifier_dump_file, 'wb') as f:
            pickle.dump(out_classifier, f)
        with open(feature_list_file, 'w') as f:
            f.writelines(token + '\n' for token in self.feature_list)
        return out_classifier

    def init_stop_words(self, stop_words_file):
        """Return ``['AT_USER', 'URL']`` followed by every stripped line of
        ``stop_words_file``."""
        stop_words = ['AT_USER', 'URL']
        with open(stop_words_file, 'r') as handle:
            stop_words.extend(word.strip() for word in handle)
        return stop_words
class MaxEntClassifier:
    """Two tweet classifiers ("related" and "awareness") that share a
    feature list and a stop-word list.

    The NLTK model actually trained depends on ``classifier_type``:
    ``'nb'`` for Naive Bayes, ``'maxent'`` for maximum entropy.
    """

    def extract_features(self, document):
        """Build the ``{'contains(<feature>)': bool}`` mapping for
        ``document`` over every entry of ``self.feature_list``."""
        present = set(document)
        return {
            'contains(%s)' % feature: (feature in present)
            for feature in self.feature_list
        }

    def get_feature_vector(self, tweet):
        """Collect unigram and bigram features from ``tweet``.

        Tokens are kept when they are not stop words and match "a letter
        followed by letters or digits".
        """
        valid_token = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")
        words = tweet.split()
        features = []

        for word in words:
            word = word.strip('\'"?!,.')
            if word in self.stop_words or valid_token.search(word) is None:
                continue
            features.append(word.lower())

        # NOTE(review): unlike the unigrams, bigrams use the raw tokens
        # (no punctuation stripping, no lowercasing). Kept unchanged since
        # existing pickled classifiers rely on these feature names.
        for left, right in nltk.bigrams(words):
            if left in self.stop_words or right in self.stop_words:
                continue
            if valid_token.search(left) is None or valid_token.search(right) is None:
                continue
            features.append(left + " " + right)
        return features

    def __init__(self, stop_words_file, related_training_data_file,
                 awareness_training_data_file, needs_training,
                 related_classifier_dump_file, awareness_classifier_dump_file,
                 feature_list_file, classifier_type='nb'):
        """Either train both classifiers or load them from their dumps.

        Training (truthy ``needs_training``) handles the related classifier
        first. Loading unpickles both dump files and reads the feature list
        from ``feature_list_file``.
        """
        self.helper = ClassifierHelper()
        self.stop_words = self.init_stop_words(stop_words_file)
        self.feature_list = []

        if needs_training:
            self.related_classifier = self.train_classifier(
                related_training_data_file, related_classifier_dump_file,
                feature_list_file, classifier_type)
            self.awareness_classifier = self.train_classifier(
                awareness_training_data_file, awareness_classifier_dump_file,
                feature_list_file, classifier_type)
        else:
            # NOTE(review): pickle.load executes arbitrary code; only load
            # trusted files.
            with open(related_classifier_dump_file, 'rb') as f:
                self.related_classifier = pickle.load(f)
            with open(awareness_classifier_dump_file, 'rb') as f:
                self.awareness_classifier = pickle.load(f)
            with open(feature_list_file, 'r') as f:
                self.feature_list.extend(token.strip() for token in f)

    def _features_for(self, tweet):
        """Process ``tweet`` with the helper and return its feature dict."""
        processed_tweet = self.helper.process_tweet(tweet)
        return self.extract_features(self.get_feature_vector(processed_tweet))

    def classify_awareness(self, tweet):
        """Label ``tweet`` with the awareness classifier."""
        return self.awareness_classifier.classify(self._features_for(tweet))

    def classify_related(self, tweet):
        """Label ``tweet`` with the related classifier."""
        return self.related_classifier.classify(self._features_for(tweet))

    def show_informative_features(self, n):
        """Invoke ``show_most_informative_features`` on the related and
        then the awareness classifier; return both results as a tuple."""
        return self.related_classifier.show_most_informative_features(n, show='pos'), self.awareness_classifier.show_most_informative_features(n)

    def train_classifier(self, training_data_file, classifier_dump_file,
                         feature_list_file, classifier_type):
        """Train, persist and return a classifier.

        Input is a CSV file (delimiter ``,``, quote character ``|``) with the
        label in column 0 and the tweet in column 1. Rows with fewer than
        two columns are ignored. The classifier is pickled to
        ``classifier_dump_file``; the feature list goes to
        ``feature_list_file``, one feature per line.

        Raises ValueError for a ``classifier_type`` other than ``'nb'`` or
        ``'maxent'``.
        """
        # Reject unknown types up front; previously this surfaced only as an
        # UnboundLocalError after the feature list file had been rewritten.
        if classifier_type not in ('nb', 'maxent'):
            raise ValueError(
                "classifier_type must be 'nb' or 'maxent', got %r"
                % (classifier_type,))

        tweets = []
        with codecs.open(training_data_file, 'r', encoding='UTF-8') as source:
            training_data = csv.reader(source, delimiter=',', quotechar='|')
            for row in training_data:
                if len(row) < 2:
                    # Blank or malformed line.
                    continue
                sentiment, tweet = row[0], row[1]
                processed_tweet = self.helper.process_tweet(tweet)
                feature_vector = self.get_feature_vector(processed_tweet)
                self.feature_list.extend(feature_vector)
                tweets.append((feature_vector, sentiment))

        # De-duplicate while keeping features gathered by earlier runs.
        self.feature_list = list(set(self.feature_list))
        training_set = nltk.apply_features(self.extract_features, tweets)

        if classifier_type == 'nb':
            out_classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
        else:
            out_classifier = nltk.classify.maxent.MaxentClassifier.train(
                training_set, 'GIS', trace=3, labels=None,
                gaussian_prior_sigma=0, max_iter=10)

        with open(classifier_dump_file, 'wb') as f:
            pickle.dump(out_classifier, f)
        with open(feature_list_file, 'w') as f:
            f.writelines(token + '\n' for token in self.feature_list)
        return out_classifier

    def init_stop_words(self, stop_words_file):
        """Return the stop-word list: ``'AT_USER'``, ``'URL'`` and then each
        stripped line of ``stop_words_file``."""
        stop_words = ['AT_USER', 'URL']
        with open(stop_words_file, 'r') as handle:
            stop_words.extend(line.strip() for line in handle)
        return stop_words
class SentimentClassifier():
    '''
    Sentiment and subjectivity classification using preloaded pickled
    classifiers, a WSD/SentiWordNet helper and TextBlob.
    '''

    def __init__(self, classifier_names="MaxentClassifier", domains="tweets"):
        '''
        Create the helpers and preload classifiers.

        classifier_names -- space separated classifier names to preload
        domains          -- space separated domain names to preload
        '''
        self.cl_helper = ClassifierHelper()
        self.wsd_helper = WSDHelper()
        self.ab_path = os.path.dirname(os.path.abspath(__file__))
        self.loaded_classifiers = self.set_classifiers(classifier_names, domains)
        self.subjective_classifier = self._load_pickle(
            self.ab_path
            + '/Data/Pickles/subjective/classifier-MaxentClassifier.rotten.pickle')

    def _load_pickle(self, path):
        '''Unpickle and return the object stored at ``path``.'''
        # The context manager closes the handle; the original left it open.
        # NOTE(review): mode 'r' is kept from the original. Binary pickles
        # need 'rb' -- confirm how the pickles were written before changing.
        # NOTE(review): pickle.load executes arbitrary code; only load
        # trusted files.
        with open(path, 'r') as pickle_file:
            return pickle.load(pickle_file)

    def subjective_and_objective_classification(self, sentence):
        '''
        Return SUBJECTIVE when the "subjective" probability is strictly
        greater than the "objective" one, otherwise OBJECTIVE.
        '''
        tokens = self.cl_helper.bag_of_words(
            self.cl_helper.extract_words(sentence, is_stem=True))
        decision = self.subjective_classifier.classify(tokens)
        # Compute the distribution once and read both labels from it.
        distribution = self.subjective_classifier.prob_classify(tokens)
        subj = distribution.prob('subjective')
        obj = distribution.prob('objective')
        print("Subjectivity = %s Objectivity = %s decision = %s" % (subj, obj, decision))
        if subj > obj:
            return SUBJECTIVE
        return OBJECTIVE

    def textblob_results(self, sentence):
        '''
        Return ``(subjectivity, polarity_decision)`` computed with TextBlob.

        Subjectivity is SUBJECTIVE when TextBlob's score exceeds 0.3,
        otherwise OBJECTIVE.
        '''
        testimonial = TextBlob(sentence)
        if testimonial.subjectivity > 0.3:
            subjectivity = SUBJECTIVE
        else:
            subjectivity = OBJECTIVE
        print("Subjectivity from Textblob %s" % (testimonial.subjectivity))
        textblob_decision = self.prepare_textblob_results(testimonial.polarity)
        return (subjectivity, textblob_decision)

    def classify(self, sentence, classifier_names="MaxentClassifier", domain="tweets"):
        """Classify the sentence with every requested classifier.

        Keyword arguments:
        classifier_names -- classifier names as one space separated string,
                            e.g. "MaxentClassifier" or
                            "MaxentClassifier NaiveBayes"; the special name
                            "WSD-SentiWordNet" selects the WSD helper
        domain -- domain of the train corpus

        Returns a list with one dict per classifier that could be run, with
        the keys "classifier", "result", "pos_score" and "neg_score".
        Classifiers that are neither preloaded nor loadable from disk are
        skipped.
        """
        results = []
        for classifier_name in classifier_names.split():
            if classifier_name == "WSD-SentiWordNet":
                # WSD Hue
                r = re.compile("[,.?()\\d]+ *")
                lines_list = r.split(sentence)
                pos, neg = self.wsd_helper.call_classifier(lines_list)
                # float() keeps the division from truncating on Python 2
                # should the helper return integer scores.
                normalize_wsd = float(pos + neg + 1)
                pos = pos / normalize_wsd
                neg = neg / normalize_wsd
            else:
                # NLTK classifiers
                key = classifier_name + "_" + domain
                classifier = self.loaded_classifiers.get(key)
                if classifier is None:
                    classifier = self.load_classifiers(classifier_name, domain)
                    if classifier is None:
                        # No pickle on disk: skip instead of failing with an
                        # AttributeError on None further down.
                        continue
                    # Remember it so later calls do not hit the disk again.
                    self.loaded_classifiers[key] = classifier
                tokens = self.cl_helper.bag_of_words(
                    self.cl_helper.extract_words(sentence))
                distribution = classifier.prob_classify(tokens)
                neg = distribution.prob('neg')
                pos = distribution.prob('pos')

            decision = self.prepare_results(pos, neg, classifier_name, domain)
            results.append({
                "classifier": classifier_name,
                "result": decision,
                "pos_score": pos,
                "neg_score": neg,
            })
        return results

    def prepare_results(self, pos, neg, classifier_name, domain):
        '''
        Print the scores and turn them into a decision.

        HARD_TO_CLASSIFY when the scores differ by at most HARD_THRESHOLD,
        POSITIVE when ``pos > neg``, NEGATIVE when ``pos < neg``.
        '''
        print("Results from %s on %s Corpus" % (classifier_name, domain))
        if abs(pos - neg) <= HARD_THRESHOLD:
            print(" Text is Neutral/Hard To Classify")
            print(' Positive = %s , Negative = %s' % (pos, neg))
            return HARD_TO_CLASSIFY
        elif pos > neg:
            print(" Text is POSITIVE")
            print(' Positive = %s Negative = %s' % (pos, neg))
            return POSITIVE
        elif pos < neg:
            print(" Text is NEGATIVE")
            print(' Positive = %s Negative = %s' % (pos, neg))
            return NEGATIVE
        else:
            # Only reachable when no comparison holds (e.g. NaN scores).
            return HARD_TO_CLASSIFY

    def prepare_textblob_results(self, textblob_polarity):
        '''
        Map a TextBlob polarity to a decision: HARD_TO_CLASSIFY within
        [-0.25, 0.25], NEGATIVE below, POSITIVE above.
        '''
        print("Results from TextBlob pattern")
        polarity = textblob_polarity
        print(' Polarity %s' % polarity)
        if -0.25 <= polarity <= 0.25:
            print(" Text is Neutral/Hard To Classify")
            return HARD_TO_CLASSIFY
        elif -0.25 > polarity:
            print(" Text is NEGATIVE")
            return NEGATIVE
        elif polarity > 0.25:
            print(" Text is POSITIVE")
            return POSITIVE
        else:
            # Only reachable when no comparison holds (e.g. NaN polarity).
            return HARD_TO_CLASSIFY

    def load_classifiers(self, classifier_name="MaxentClassifier", domain_name="tweets"):
        '''
        Unpickle the classifier for ``classifier_name``/``domain_name``.

        Returns None when the pickle file does not exist.
        '''
        pickled_classifier = 'classifier-%s.%s.pickle' % (classifier_name, domain_name)
        pickle_path = self.ab_path + '/' + 'Data/Pickles/%s/%s' % (domain_name, pickled_classifier)
        if not os.path.exists(pickle_path):
            return None
        return self._load_pickle(pickle_path)

    def set_classifiers(self, classifier_names="MaxentClassifier", domains="tweets"):
        '''
        Load every available classifier/domain combination.

        Returns a dict keyed by "<classifier>_<domain>". The
        "WSD-SentiWordNet" name and combinations without a pickle file are
        left out.
        '''
        print("Loading classfiers...")
        req_domains = domains.split()
        loaded_classifiers = {}
        for classifier_name in classifier_names.split():
            if classifier_name == "WSD-SentiWordNet":
                continue
            for domain_name in req_domains:
                classifier = self.load_classifiers(classifier_name, domain_name)
                if classifier is None:
                    continue
                classifier_key_name = classifier_name + "_" + domain_name
                loaded_classifiers[classifier_key_name] = classifier
                print("Classifier %s loaded!..." % (classifier_key_name))
        return loaded_classifiers