def predict(self, text):
    """Return prediction label(s) for the given input text.

    Args:
        text: a single document (str), a list of documents, or a dict
            mapping keys to documents.

    Returns:
        A single label for a str input, a list of labels (same order) for
        a list input, or an OrderedDict mapping each key to its label for
        a dict input.

    Raises:
        TypeError: if *text* is not a str, list, or dict.
    """
    if isinstance(text, str):
        # Single document: classify directly.
        return self.classifier.classify(self.extract_features(text))
    if isinstance(text, list):
        # Batch of documents: labels returned in input order.
        return [self.classifier.classify(self.extract_features(email))
                for email in text]
    if isinstance(text, dict):
        # Keyed batch: keep the key -> label association (insertion order
        # preserved by OrderedDict).
        return collections.OrderedDict(
            (key, self.classifier.classify(self.extract_features(email)))
            for key, email in text.items())
    # The original fell through here and hit an unbound local (NameError);
    # fail with a clear, conventional error instead.
    raise TypeError(
        "text must be a str, list, or dict, got %s" % type(text).__name__)
# NOTE(review): this span begins mid-function — the enclosing `def` (it ends
# with `return feature_probdist` below) lies outside the visible region.
# Python 2 code using the pre-NLTK-3 `FreqDist.inc` API.
feature_freqdist = defaultdict(FreqDist)
feature_values = defaultdict(set)
# presumably half the samples carry each label — TODO confirm the split
num_samples = len(train_samples) / 2
for token, counts in labeled_features.items():
    for label in ['neg','pos']:
        # Record occurrences of the feature (True) and, as `None`, the
        # samples of this label in which the token did NOT appear.
        feature_freqdist[label, token].inc(True, count=counts[label])
        feature_freqdist[label, token].inc(None, num_samples - counts[label])
        feature_values[token].add(None)
        feature_values[token].add(True)
# Debug dump of every (label, token) frequency distribution (Python 2 print).
for item in feature_freqdist.items():
    print item[0],item[1]
# Convert each frequency distribution into a smoothed (Expected Likelihood
# Estimate) probability distribution keyed by (label, feature name).
feature_probdist = {}
for ((label, fname), freqdist) in feature_freqdist.items():
    probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
    feature_probdist[label,fname] = probdist
return feature_probdist

# ---- script driver (top level, after the function above) ----
labeled_features = get_labeled_features(train_samples)
label_probdist = get_label_probdist(labeled_features)
feature_probdist = get_feature_probdist(labeled_features)
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
for sample in test_samples:
    # Print each test sample next to its predicted label (Python 2 print).
    print "%s | %s" % (sample, classifier.classify(gen_bow(sample)))
classifier.show_most_informative_features()
# NOTE(review): this span starts mid-call — the opening of the join
# (presumably `model_path = os.path.join(`) and the enclosing method header
# lie outside the visible region; the `def`s below are class methods.
        os.path.split(os.path.realpath(__file__))[0],
        ModelConfigReader.getModelFolder(),
        ModelConfigReader.getModelSavedName(model_name))
        print("model path: " + model_path)
        # NOTE(review): pickle.load is unsafe on untrusted files, and the
        # file handle opened here is never closed — consider `with open(...)`.
        self.classifier = pickle.load(open(model_path, "rb"))

    def _mostInformativeFeatures(self, limit=20):
        # Delegate to the underlying classifier's diagnostic helper;
        # shows up to `limit` most informative features.
        return self.classifier.show_most_informative_features(limit)

    def classify(self, sentence):
        '''
        Classify the sentence as positive or negative.
        It returns the classification together with probability values.
        for eg:
        {'prediction': 'pos', 'pos': 0.582766958567508, 'neg': 0.41723304143249396}
        '''
        # bagOfWords is defined elsewhere in this module — presumably it
        # builds a token-presence feature dict; verify against its definition.
        probResult = self.classifier.prob_classify(bagOfWords(sentence))
        return {
            "prediction": probResult.max(),
            "pos": probResult.prob("pos"),
            "neg": probResult.prob("neg")
        }

# ---------- Testing ----------------------
if __name__ == "__main__":
    classifier = NaiveBayesClassifier()
    sentence = "What a fascinating day. I am sure loving the weather."
    sentence2 = "I don't like this at all. Rich people dominating the value of democracy."
    print(classifier.classify(sentence))
    print(classifier.classify(sentence2))
# NOTE(review): this span begins inside nested loops whose headers are
# outside the visible region (iterating training papers, then their tokens);
# Python 2 print statements throughout.
            labelled_features[word.lower()][label] += features[word]
    # Progress report — presumably once per parsed paper; TODO confirm nesting.
    print "Currently at %d distinct tokens and %d papers" % (
        len(labelled_features), samplecount)

# Build the classifier from the accumulated per-token label counts.
label_probdist = get_label_probdist(labelled_features)
feature_probdist = get_feature_probdist(labelled_features)
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

# Evaluate: parse each held-out paper into a token-presence feature dict
# and compare the predicted label against the directory-derived gold label.
for samplefile in test_samples:
    features = {}
    p = PaperParser()
    p.parsePaper(samplefile)
    for sentence in p.extractRawSentences():
        tokens = nltk.word_tokenize(sentence)
        for word in tokens:
            # Presence-only features: any occurrence maps to True.
            features[word] = True
    # The parent directory name encodes the actual label — TODO confirm
    # against how `labels` is populated.
    dirname = os.path.basename(os.path.dirname(samplefile))
    label = labels[dirname]
    print "file: %s | actual: %s | predicted: %s" % (
        samplefile, label, classifier.classify(features))
classifier.show_most_informative_features()
import json
import re

from nltk import NaiveBayesClassifier


def clean_tweet(tweet):
    """Strip @mentions, URLs, and non-alphanumeric characters from a tweet.

    Surviving tokens are re-joined with single spaces.
    """
    # Raw string avoids invalid-escape warnings; the alternation removes,
    # in order: @mentions, any char not alphanumeric/space/tab, and URLs.
    return ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
               " ", tweet).split())


def main():
    """Load labelled tweets, train a classifier, classify a sample phrase."""
    mydata = []
    # `with` closes the file deterministically (the original leaked the
    # handle opened for convertcsv.json).
    with open('convertcsv.json', 'r') as json_data:
        data = json.load(json_data)
    for d in data:
        # hate_speech == 0 -> positive example, anything else -> negative.
        if d.get('hate_speech') == 0:
            mydata.append({"text": clean_tweet(d.get('tweet')), "label": "pos"})
        else:
            mydata.append({"text": clean_tweet(d.get('tweet')), "label": "neg"})
    # NOTE(review): the (train_data, format="json") constructor signature is
    # TextBlob's NaiveBayesClassifier API, not nltk's — as imported from
    # nltk this call would fail; confirm which library was intended.
    cl = NaiveBayesClassifier(mydata, format="json")
    cl.classify("This is an amazing library!")


if __name__ == "__main__":
    main()