def predict(self, text):
    """
    Return prediction labels for the given input text.

    Accepts a single string, a list of strings, or a dict mapping keys to
    strings, and mirrors that shape in the returned value.
    """
    if isinstance(text, str):
        predictions = self.classifier.classify(self.extract_features(text))
    elif isinstance(text, list):
        predictions = [self.classifier.classify(self.extract_features(email))
                       for email in text]
    elif isinstance(text, dict):
        # Build from (key, value) pairs so ordering is preserved even on
        # Python versions where plain dicts are unordered.
        predictions = collections.OrderedDict(
            (key, self.classifier.classify(self.extract_features(email)))
            for key, email in text.items())
    else:
        raise TypeError("text must be a str, list, or dict")
    return predictions
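
A short usage sketch for the method above; clf stands in for an already-trained instance of the surrounding class (the class itself is not shown in the snippet):

# clf is assumed to be a trained instance exposing predict() as defined above.
print(clf.predict("Win a free cruise now!"))                        # single label
print(clf.predict(["Meeting at 10am", "You won a prize"]))          # list of labels, same order
print(clf.predict({"a": "Lunch today?", "b": "Claim your prize"}))  # OrderedDict keyed like the input
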
Example #2
from collections import defaultdict

from nltk import NaiveBayesClassifier
from nltk.probability import ELEProbDist, FreqDist


def get_feature_probdist(labeled_features):
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    # train_samples (defined at module level) is split evenly between the two
    # labels, so each label covers half of the samples.
    num_samples = len(train_samples) // 2
    for token, counts in labeled_features.items():
        for label in ['neg', 'pos']:
            # FreqDist is a Counter subclass in NLTK 3, so update counts
            # directly (the old FreqDist.inc() method no longer exists).
            feature_freqdist[label, token][True] += counts[label]
            feature_freqdist[label, token][None] += num_samples - counts[label]
            feature_values[token].add(None)
            feature_values[token].add(True)
    for item in feature_freqdist.items():
        print(item[0], item[1])
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist
    return feature_probdist
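
For reference, ELEProbDist applies expected likelihood estimation, i.e. Lidstone smoothing with gamma = 0.5: prob = (count + 0.5) / (total + 0.5 * bins). A small standalone sketch with toy counts chosen only for illustration:

from nltk.probability import ELEProbDist, FreqDist

# Toy frequency distribution: feature observed 3 times, absent 1 time.
fd = FreqDist({True: 3, None: 1})
pd = ELEProbDist(fd, bins=2)

print(pd.prob(True))   # (3 + 0.5) / (4 + 0.5 * 2) = 0.7
print(pd.prob(None))   # (1 + 0.5) / (4 + 0.5 * 2) = 0.3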


labeled_features = get_labeled_features(train_samples)

label_probdist = get_label_probdist(labeled_features)

feature_probdist = get_feature_probdist(labeled_features)

classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

for sample in test_samples:
    print("%s | %s" % (sample, classifier.classify(gen_bow(sample))))

classifier.show_most_informative_features()
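
For comparison, NLTK can build an equivalent model directly from labelled featuresets via NaiveBayesClassifier.train, which also uses ELE estimation by default. A sketch that assumes train_samples is a list of (text, label) pairs (not shown above) and reuses the gen_bow helper:

# Alternative: let NLTK derive the label and feature distributions itself.
train_featuresets = [(gen_bow(text), label) for text, label in train_samples]
nb = NaiveBayesClassifier.train(train_featuresets)
print(nb.classify(gen_bow(test_samples[0])))
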
Example #3
        model_path = os.path.join(
            os.path.split(os.path.realpath(__file__))[0],
            ModelConfigReader.getModelFolder(),
            ModelConfigReader.getModelSavedName(model_name))
        print("model path: " + model_path)
        with open(model_path, "rb") as model_file:
            self.classifier = pickle.load(model_file)

    def _mostInformativeFeatures(self, limit=20):
        # NLTK's show_most_informative_features() prints to stdout and
        # returns None, so this method is for display only.
        return self.classifier.show_most_informative_features(limit)

    def classify(self, sentence):
        '''
        Classify the sentence as positive or negative.
        Returns the predicted label together with the class probabilities,
        e.g. {'prediction': 'pos', 'pos': 0.582766958567508, 'neg': 0.41723304143249396}
        '''
        probResult = self.classifier.prob_classify(bagOfWords(sentence))
        return {
            "prediction": probResult.max(),
            "pos": probResult.prob("pos"),
            "neg": probResult.prob("neg")
        }


# ---------- Testing ----------------------
if __name__ == "__main__":
    classifier = NaiveBayesClassifier()
    sentence = "What a fascinating day. I am sure loving the weather."
    sentence2 = "I don't like this at all. Rich people dominating the value of democracy."
    print(classifier.classify(sentence))
    print(classifier.classify(sentence2))
Example #4
                labelled_features[word.lower()][label] += features[word]

            print "Currently at %d distinct tokens and %d papers" % (
                len(labelled_features), samplecount)

    label_probdist = get_label_probdist(labelled_features)

    feature_probdist = get_feature_probdist(labelled_features)

    classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

    for samplefile in test_samples:
        features = {}

        p = PaperParser()
        p.parsePaper(samplefile)

        for sentence in p.extractRawSentences():
            tokens = nltk.word_tokenize(sentence)

            for word in tokens:
                features[word] = True

        dirname = os.path.basename(os.path.dirname(samplefile))
        label = labels[dirname]

        print "file: %s | actual: %s | predicted: %s" % (
            samplefile, label, classifier.classify(features))

    classifier.show_most_informative_features()
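
The loop above prints one prediction per file; overall accuracy on the same test set can be computed with nltk.classify.accuracy. A sketch that rebuilds the (featureset, label) pairs the same way (gold is an added name, not from the original):

import nltk

# Collect (featureset, label) pairs for the held-out papers, then score them.
gold = []
for samplefile in test_samples:
    features = {}
    p = PaperParser()
    p.parsePaper(samplefile)
    for sentence in p.extractRawSentences():
        for word in nltk.word_tokenize(sentence):
            features[word] = True
    label = labels[os.path.basename(os.path.dirname(samplefile))]
    gold.append((features, label))

print("accuracy: %.3f" % nltk.classify.accuracy(classifier, gold))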
Example #5
import json
import re

# The training call below follows TextBlob's NaiveBayesClassifier (a wrapper
# around NLTK's); nltk.NaiveBayesClassifier has a different constructor.
from textblob.classifiers import NaiveBayesClassifier


def clean_tweet(tweet):
    return ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
               tweet).split())
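
A quick check of what clean_tweet strips (mentions, URLs, and punctuation), using a made-up tweet:

# Mentions, URLs, and non-alphanumeric characters become spaces; the final
# split()/join() collapses the leftover whitespace.
print(clean_tweet("@user Check https://t.co/xyz great day!!!"))
# -> "Check great day"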


mydata = []
with open('convertcsv.json', 'r') as json_data:
    data = json.load(json_data)

# TextBlob's classifier trains on (text, label) pairs; the dict form with
# "text"/"label" keys is only accepted when read from a file-like object
# with format="json".
for d in data:
    label = "pos" if d.get('hate_speech') == 0 else "neg"
    mydata.append((clean_tweet(d.get('tweet')), label))

cl = NaiveBayesClassifier(mydata)
print(cl.classify("This is an amazing library!"))