def train(samples_proportion=0.7):
    global words_in_ham, ham_word_count, words_in_spam, spam_word_count, raw_ham_prob, raw_spam_prob

    ham, spam = read_spam_ham()

    print("Spam size: " + str(len(spam)) + " Ham size: " + str(len(ham)))

    all_emails = append_ham_and_spam(ham, spam)

    random.shuffle(all_emails)

    print('Corpus size = ' + str(len(all_emails)) + ' emails')

    features = [(Preprocessor.get_features(email, ' '), label)
                for (email, label) in all_emails]

    print('Collected ' + str(len(features)) + ' feature sets')
    '''
    # define the support value in %
    support = 10
    spam_support_count = (len(spam) * support) / 100
    ham_support_count = (len(ham) * support) / 100
    print('Spam support count: ' + str(spam_support_count))
    print('Ham support count: ' + str(ham_support_count))
    # get the spam frequent itemset and the ham frequent itemset
    # spam_frequent, ham_frequent = get_frequent(all_features, spam_support_count, ham_support_count)
    # train our own Naive Bayes classifier and collect a dictionary of raw word probabilities
    '''

    train_size = int(len(features) * samples_proportion)

    train_set, test_set = features[:train_size], features[train_size:]

    ham_mail_count, spam_mail_count = mails_in_ham_spam(train_set)

    spam_prior = 1.0 * spam_mail_count / len(train_set)
    ham_prior = 1.0 * ham_mail_count / len(train_set)

    words_in_ham, words_in_spam = frequency_in_ham_spam(train_set)

    spam_vocab = len(spam_word_count)
    ham_vocab = len(ham_word_count)

    t = get_probabilities_in_each_class(ham_prior, words_in_ham, ham_vocab,
                                        ham_word_count, raw_ham_prob,
                                        raw_spam_prob, spam_prior,
                                        words_in_spam, spam_vocab,
                                        spam_word_count, test_set, train_set)

    ham_prior, words_in_ham, ham_vocab, raw_ham_prob, raw_spam_prob, spam_prior, words_in_spam, spam_vocab, test_set, train_set = get_parameters(
        t)
    #print("Train Size:" + str(len(train_set)) + str(' Test size:') + str(len(test_set)))

    #evaluate(train_set, test_set, raw_spam_prob, raw_ham_prob, words_in_spam, words_in_ham, spam_vocab, ham_vocab,
    #         spam_prior,
    #         ham_prior)

    classifier = NaiveBayesClassifier(list(spam_word_count),
                                      list(ham_word_count))
    t = classifier.prob_classify(train_set).max()
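The helper functions Example #1 relies on (frequency_in_ham_spam, get_probabilities_in_each_class, etc.) are not shown. A minimal sketch of the add-one (Laplace) smoothing such a helper would typically apply; the names word_prob, class_counts, total_tokens, and vocab_size are placeholders, not part of the original code:

def word_prob(word, class_counts, total_tokens, vocab_size):
    # P(word | class) with add-one smoothing: an unseen word gets a small
    # non-zero probability instead of zeroing out the whole product.
    return (class_counts.get(word, 0) + 1.0) / (total_tokens + vocab_size)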
Example #2
def train(labeled_featuresets, estimator=ELEProbDist):
    """Train a classifier from the labeled feature sets."""
    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)

    # Create the P(fval | label, fname) distribution
    feature_probdist = {}

    return NaiveBayesClassifier(label_probdist, feature_probdist)
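The snippet above never builds label_freqdist and leaves feature_probdist empty. A minimal sketch of the missing counting step, assuming NLTK's FreqDist and that labeled_featuresets is a list of (featureset, label) pairs:

from nltk.probability import FreqDist

label_freqdist = FreqDist()
for featureset, label in labeled_featuresets:
    label_freqdist[label] += 1
# estimator(label_freqdist) then smooths these counts into P(label);
# ELEProbDist uses the expected likelihood estimate (add 0.5).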
Example #3
    def __init__(self,
                 label_probdist=None,
                 feature_probdist=None,
                 estimator=ELEProbDist):
        """Initialize NBClassifier."""
        self._estimator = estimator

        # in case arguments are specified (i.e. when restoring the classifier)
        if all([label_probdist, feature_probdist]):
            self._classifier = NaiveBayesClassifier(
                label_probdist=label_probdist,
                feature_probdist=feature_probdist,
            )
        else:
            self._classifier = None
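The "restoring" branch implies the two distributions are persisted somewhere. A hypothetical round-trip with pickle, assuming the wrapper class is called NBClassifier as the docstring suggests (file and variable names are illustrative only):

import pickle

# save the trained distributions
with open('nb_state.pickle', 'wb') as f:
    pickle.dump((label_probdist, feature_probdist), f)

# later: restore them and rebuild the wrapper
with open('nb_state.pickle', 'rb') as f:
    label_probdist, feature_probdist = pickle.load(f)
restored = NBClassifier(label_probdist=label_probdist,
                        feature_probdist=feature_probdist)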
Example #4
def train(labeled_featuresets, estimator=ELEProbDist):

    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution
    feature_probdist = {}

    return NaiveBayesClassifier(label_probdist, feature_probdist)
Example #5
import random

import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

allWords = []
for w in movie_reviews.words():
    allWords.append(w.lower())

allWords = nltk.FreqDist(allWords)
wordFeatures = list(allWords.keys())[:3000]


def findFeatures(document):
    words = set(document)
    features = {}
    for w in wordFeatures:
        features[w] = (w in words)
    return features

#print((findFeatures(movie_reviews.words('neg/cv000_29416.txt'))))
featureSets = [(findFeatures(rev), category) for (rev, category) in documents]

new_training_set = featureSets[:100]
testing_set = featureSets[100:]


cl = NaiveBayesClassifier.train(new_training_set)
print(nltk.classify.accuracy(cl, testing_set))
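A natural follow-up with the same NLTK classifier is to inspect which words carried the most weight:

cl.show_most_informative_features(15)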


    print("Also see: Hindu Marriage Act")
elif resultc != -1 or y == "Christian":
    f1 = open("Christian.txt")
    f2 = open("christian01.txt")
    l1 = f1.read()
    arr = sent_tokenize(l1)
    l2 = f2.read()
    arr2 = word_tokenize(l2)
    for sentence, label in zip(arr, arr2):
        li1.append((sentence, label))
    f1.close()
    f2.close()
    print("Also see: Indian Divorce Act")
mycase = sys.argv[3]
#mycase=input("enter your case ")
c1 = 0
c2 = 0
model = NaiveBayesClassifier(li1)
#model=nltk.NaiveBayesClassifier.train(li1)
#print(model.classify(mycase))
case = sent_tokenize(mycase)
print(mycase)
for i in range(0, len(case)):
    temp = model.classify(case[i])

    if temp == "0":
        c1 = c1 + 1
    else:
        c2 = c2 + 1
print("Probability of winning case", (c1 / (c1 + c2)) * 100)
Example #8
def train(labeled_featuresets, estimator=ELEProbDist):
    label_probdist = estimator(label_freqdist)
    feature_probdist = {}
    return NaiveBayesClassifier(label_probdist, feature_probdist)
Example #9
            for word in features:

                if word.lower() not in labelled_features:
                    # copy the template so tokens don't share one counter
                    labelled_features[word.lower()] = dict(label_count)

                labelled_features[word.lower()][label] += features[word]

            print("Currently at %d distinct tokens and %d papers" % (
                len(labelled_features), samplecount))

    label_probdist = get_label_probdist(labelled_features)

    feature_probdist = get_feature_probdist(labelled_features)

    classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

    for samplefile in test_samples:
        features = {}

        p = PaperParser()
        p.parsePaper(samplefile)

        for sentence in p.extractRawSentences():
            tokens = nltk.word_tokenize(sentence)

            for word in tokens:
                features[word] = True

        dirname = os.path.basename(os.path.dirname(samplefile))
        label = labels[dirname]
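get_label_probdist and get_feature_probdist are not shown in Example #9. A rough sketch of what they might compute, assuming labelled_features maps token -> {label: count} and reusing NLTK's ELEProbDist:

from nltk.probability import ELEProbDist, FreqDist

def get_label_probdist(labelled_features):
    # Pool the per-token counts into one frequency count per label.
    label_freqdist = FreqDist()
    for counts in labelled_features.values():
        for label, count in counts.items():
            label_freqdist[label] += count
    return ELEProbDist(label_freqdist)

def get_feature_probdist(labelled_features):
    # One P(fval | label, fname) distribution per (label, token) pair,
    # which is the shape nltk's NaiveBayesClassifier expects.
    feature_probdist = {}
    for token, counts in labelled_features.items():
        for label, count in counts.items():
            freqdist = FreqDist()
            freqdist[True] = count
            feature_probdist[(label, token)] = ELEProbDist(freqdist, bins=2)
    return feature_probdist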
Example #10
def updateNaiveBayes():
    cl = NaiveBayesClassifier.train(new_training_set)
    print(nltk.classify.accuracy(cl, testing_set))
Example #11
import pickle


def train():

    classifier = NaiveBayesClassifier(training_data)
    # Persist the trained classifier so it can be reloaded later.
    with open('algorithm.pickle', 'wb') as f:
        pickle.dump(classifier, f)
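The pickled classifier can be reloaded later without retraining; a minimal counterpart (a hypothetical helper, using the same file name as above):

def load():
    with open('algorithm.pickle', 'rb') as f:
        return pickle.load(f)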
import json
import re

from textblob.classifiers import NaiveBayesClassifier


def clean_tweet(tweet):
    # Strip @mentions, URLs, and any character that is not alphanumeric.
    return ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
               tweet).split())


mydata = []
with open('convertcsv.json', 'r') as json_data:
    data = json.load(json_data)
for d in data:
    # Label non-hate tweets "pos" and hate tweets "neg";
    # TextBlob expects a list of (text, label) tuples.
    if d.get('hate_speech') == 0:
        mydata.append((clean_tweet(d.get('tweet')), "pos"))
    else:
        mydata.append((clean_tweet(d.get('tweet')), "neg"))

cl = NaiveBayesClassifier(mydata)
print(cl.classify("This is an amazing library!"))
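TextBlob's classifier can also report per-label probabilities instead of only the winning label:

prob_dist = cl.prob_classify("This is an amazing library!")
print(prob_dist.max())                  # most likely label
print(round(prob_dist.prob("pos"), 2))  # probability of "pos"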