Code Example #1
import pandas as pd
from textblob.classifiers import NaiveBayesClassifier


def train_data(ticker):

    df = pd.read_csv('../tmp/training_data/' + ticker + '2015-2016_data1.csv')
    train_df = df[['snippet', 'price change']]
    print("Training News Dataset")
    print(train_df.head(5))
    # to_numpy() replaces the deprecated DataFrame.as_matrix()
    cl = NaiveBayesClassifier(train_df.to_numpy())

    df = pd.read_csv('../tmp/training_data/' + ticker + '2016-2017_data1.csv')
    dataset = df[['snippet', 'price change']]

    classified = []
    right = 0
    #print(dataset.head(n=5))
    print("\nClassifying dataset\n")
    for index, row in dataset.iterrows():
        classified.append(cl.classify(row['snippet']))
        right += 1 if row['price change'] == classified[index] else 0

    dataset['News Sent.'] = classified
    path = '../tmp/results/News/' + ticker + '_results.csv'
    dataset.to_csv(path, encoding='utf-8', index=False)
    #dataset['Price Sent.'] = real_sent
    print(dataset[['snippet', 'price change', 'News Sent.']].head(n=20))
    total = len(dataset['snippet'])
    print("\nCalculating")
    print("\nRight %d, Total %d" % (right, total))
    print("Correct percentage %.2f %%" % ((1.0 * right / total) * 100))
    #print(cl.classify(dataset.to_numpy()))
    cl.show_informative_features(10)  # prints the table itself; returns None
Code Example #2
def generateClassifier():
    train = getIntentDataset()

    cl = NaiveBayesClassifier(train)
    cl.show_informative_features(5)    
    path = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/QueryAnalyzer/Models/"
    saveTrainedClassifier(path, cl, "intent_classifier_2.pickle")
Code Example #3
def generateIntentionalityClassifier():
    db = dbClient()    
    training = db.training
    cursor = training.find()    
    
    # reduce the number of records: keep a random 1% sample as the working set
    crs = list(cursor)
    random.shuffle(crs)
    p = int(len(crs) * .01)
    cr_test = crs[0:p]

    print("Test", len(cr_test))
    
    data = []
    t = ""
    for td in cr_test:
        tgram = td["triGram"]
        label = td["label"] 
        #print tgram
        for tg in tgram:
            d = '-'.join(tg)
            t = t + " " + d
        #print t
        data.append((t, label))
        t = ""
    #print data
    cl = NaiveBayesClassifier(data)
    cl.show_informative_features(30)    
    path = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/QueryAnalyzer/Models/"
    saveTrainedClassifier(path, cl, "my_classifier_v6.pickle")
    return cl
Code Example #4
def create_sentiment():
    """
        Train sentiment model and save.

        Input type: None 
        Output: Model as pickle 
    """

    random.seed(1)

    test = [
        ("The dude presenting Unravel seems like one of the most genuine game developers Ive ever seen I really hope this game works out for him",'pos'),
        ("His hands are shaking Dude looks so stoked and scared at the same time",'pos'),
        ("Right I just felt like I was watching his dream come true It was nice The game looks very well done as well Good for him",'pos'),
        ("Seriously Unravel looks really good actually and honestly seeing him so happy about what hes made is contagious I want to see more of Unravel ",'pos'),
        ("He was so nervous shaking all over his voice quivering",'neg'),
        ("The game looked nice too very cute art style ",'pos'),
        ("You could tell he genuinely wanted to be there it looked like he was even shaking from the excitement  I hope it works out for them aswell",'pos'),
        ("However following that up with the weird PvZ thing was odd To say the least",'neg'),
        ("Haha The game did look nice though Im definitely going to keep an eye on it I enjoy supporting such hopeful developers",'pos'),
        ("Very personable This looks like a buy for me As a dev in a other sector I appreciate this passion",'pos'),
        ("I want to give him a cookie",'pos'),
        ("Im getting a copy Im gonna support my indie devs",'pos'),
        ("The twitch leak was accurate It was like a play by play you start speaking French then switch to English",'neg'),
        ("yep exactly what i was thinking lol its important to note that the twitch leak never had them saying it was Dishonored 2 but that they were honored to be here very different",'neg'),
        ("Honored  Im 100 sure that was intentional",'neg'),
        ("oh yea for sure but wasnt solid enough evidence imo to be like dishonored 2 confirmed just based off that",'neg'),
        ("The confirmation was who was talking not what they were talking about ",'neg'),
        ("How awkward is it for a pop singer to perform at a video game conference",'neg'),
        ("Oh god did they warn him that he will get zero reaction",'neg'),
        ("I really hope so",'pos'),
        ("Almost as bad as Aisha f*****g up her dialogue constantly Shes doing alright though E3 is really becoming a mainstream media event Hollywood has nothing like this ComicCon is the only comparison and they dont dazzle it up like E3",'neg')
        ]


    # Grab review data
    reviews = [
        (list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)
        ]
    random.shuffle(reviews)

    # Hold out the last 100 of the 2,000 shuffled reviews as an extra test split
    new_train, new_test = reviews[:1900], reviews[1900:]

    # Train the NB classifier on the train split
    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy
    accuracy = cl.accuracy(test + new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features
    cl.show_informative_features(5)

    # Save model for use in creating social model sentiment
    with open('sentiment_clf_full.pkl', 'wb') as pk:
        pickle.dump(cl, pk)
    print('done saving model')
Code Example #5
File: sentiment.py  Project: ayhun/MMDS
class NaiveBayesAnalyzer:
    cl = None

    def __init__(self):
        with open("training_data.json", "r") as f:
            self.cl = NaiveBayesClassifier(f, format="json")
        self.cl.show_informative_features(20)

    def analyze(self, text):
        return self.cl.classify(text)
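A note on the JSON input this class assumes: TextBlob's format="json" loader expects a top-level list of objects with "text" and "label" keys. A minimal sketch of a compatible training_data.json (contents illustrative, not taken from the ayhun/MMDS project):

# training_data.json -- illustrative contents:
# [
#     {"text": "I love this sandwich.", "label": "pos"},
#     {"text": "This is an awful place.", "label": "neg"}
# ]
from textblob.classifiers import NaiveBayesClassifier

with open("training_data.json", "r") as f:
    cl = NaiveBayesClassifier(f, format="json")
print(cl.classify("What a great sandwich!"))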
Code Example #6
    def train(self):
        """
        Train a classifier dependeding on the data
        """
        train = []

        for intent in self.data["intents"]:
            for pattern in intent["patterns"]:
                train.append((pattern, intent["label"]))

        pprint.pprint(train)
        cl = NaiveBayesClassifier(train)
        print('Accuracy:', cl.accuracy(train))  # note: measured on the training data itself
        cl.show_informative_features(5)
        return cl
Code Example #7
def enginemongo(text):
    from textblob.classifiers import NaiveBayesClassifier
    trainingset = db.trainingset.find()
    tsarr = []
    for t in trainingset:
        tsarr.append((t["question"], t["answer"]))

    print(tsarr)
    cl = NaiveBayesClassifier(tsarr)
    prob_dist = cl.prob_classify(text)
    print("TEST:", text, " ", prob_dist, " ", prob_dist.max())
    maxprob = 0
    maxanswer = ""
    for a in prob_dist.samples():
        pd = round(prob_dist.prob(a), 2)
        if pd > maxprob:
            maxprob = pd
            maxanswer = a
        print(a, ":", pd)
    cl.show_informative_features()  # prints directly; returns None
    print("RISPOSTA:", maxanswer, " --- ", maxprob)
    aa = cl.extract_features(text)
    print(aa)
    print("---------------------------------------")
    return {"answer_key": maxanswer, "answer_prob": maxprob}
Code Example #8
File: language_detector.py  Project: MARS87/ieor242
class LanguageDetector(object):
    def __init__(self, train=SAMPLE_TRAIN, feature_extractor=FeatureExtractors.last_word_extractor()):
        self.train = train
        self.classifier = NaiveBayesClassifier(self.train, feature_extractor)
    
    def accuracy(self, test_set=SAMPLE_TEST):
        return self.classifier.accuracy(test_set)

    def show_features(self):
        return self.classifier.show_informative_features(5)
Code Example #9
def train_NBC(filepath):
	new_df = read_data(filepath)
	new_train_test = new_df.values.tolist()
	x_train, x_test = train_test_split(new_train_test, test_size=0.1)
	
	cl = NaiveBayesClassifier(x_train)
	# print(cl.classify("Please create an assignment and forward it by EOD"))
	# print(cl.classify("Im not a dessert person but the warm butter cake should be illegal its so good."))
	
	print("Acheived a test accuracy of : %s " % cl.accuracy(x_test))
	
	# details of classifier train
	cl.show_informative_features()
	
	if not os.path.isdir("./models"):
		os.mkdir("./models")
	# saving the trained model
	with open("./models/cl_NBC.obj", "wb") as f:
		pickle.dump(cl, f)
Code Example #10
File: 2_textBlob_Model.py  Project: MLayne1/Sentanal
def main():
    print("Running!")
    # train textblob NaiveBayesClassifier
    with open(SRC_TRAIN, encoding='utf-8', mode='r') as train:
        cl = NaiveBayesClassifier(train, format="json")
        cl.show_informative_features(10)

    # classify each article in the test data
    with open(SRC_TEST, encoding='utf-8') as test:

        #load json to a json object
        articles = json.load(test)
        print("to classify: " + str(len(articles)))

        # iterate through articles
        count = 0
        for article in articles:
            count += 1

            givenLabel = cl.classify(article['text'])
            trueLabel = article['label']

            correct = measure(givenLabel, trueLabel)

            print(
                str(count) + " Classified:" + givenLabel + " Label:" +
                trueLabel + (" correct" if correct else " wrong"))

    accuracy = (nTruePositive + nTrueNegative) / (
        nTruePositive + nTrueNegative + nFalsePositive + nFalseNegative)
    fMeasure = (2 * nTruePositive) / (
        (2 * nTruePositive) + nFalsePositive + nFalseNegative)

    print("accuracy: {0}".format(accuracy))
    print("F1-Score: {0}".format(fMeasure))

    print("TP: {0} FP: {1} TN: {2} FN: {3}".format(nTruePositive,
                                                   nFalsePositive,
                                                   nTrueNegative,
                                                   nFalseNegative))
Code Example #11
class XcLassify(object):
    """TODO"""
    def __init__(self):
        self.__cl = None
        self.__traindata = None
        self.__testdata = None

    def _fetch_clean(self, filepath):
        """TODO"""
        dframe = pandas.read_excel(filepath)
        dframe.iloc[:, 0] = dframe.iloc[:, 0].map(clean_str)
        dframe.iloc[:, 0] = dframe.iloc[:, 0].map(anycode)
        return dframe.iloc[:, 0:2].to_records(index=False).tolist()

    def _split_data(self, datalist, test_ratio):
        """TODO"""
        self.__traindata, self.__testdata = train_test_split(datalist, test_size=test_ratio)
        return self.__traindata, self.__testdata

    def data_from_excel(self, filepath, test_ratio=0.24):
        datalist = self._fetch_clean(filepath)
        # pass the caller's ratio through (it was previously hard-coded to 0.2)
        return self._split_data(datalist, test_ratio)

    def train(self, update=False, new_data=None):
        """TODO"""
        if update and new_data:
            self.__cl.update(new_data)
        else:
            self.__cl = NaiveBayesClassifier(self.__traindata)

    def classify(self, text):
        """TODO"""
        text = clean_str(text, post_func=anycode)
        return self.__cl.classify(text)

    def benchmark(self, show_best_features=False):
        """TODO"""
        print('\nAccuracy: %0.3f\n' % self.__cl.accuracy(self.__testdata))
        if show_best_features:
            self.__cl.show_informative_features()
Code Example #12
File: twebit.py  Project: securitywarrior/twebit
def main():
    auth = OAuthHandler(consumerKey, consumerSecret)
    auth.set_access_token(accessToken, accessSecret)
    train = []
    test = []
    getData('data/pos.txt', 'pos',train) #get data from txt file
    getData('data/neg.txt', 'neg',train)
    getData('data/neut.txt', 'neut',train)
    getData('data/test/testNeut.txt', 'neut',test)
    getData('data/test/testPos.txt', 'pos', test)
    getData('data/test/testNeg.txt', 'neg', test)

    global cl
    cl = NaiveBayesClassifier(train)

    while True: # get tweets from twitter
        cl.show_informative_features(5)
        print(cl.accuracy(test))
        twitterStream = Stream(auth, listener())
        # newer tweepy releases renamed 'async' (a reserved word since Python 3.7) to 'is_async'
        twitterStream.filter(track=["bitcoin"], is_async=True, stall_warnings=True)
        time.sleep(6000) # check
        twitterStream.disconnect()
Code Example #13
def main(argv=0):
    nBObj = naiveBayes()
    businessId = nBObj.deriveBusinessId('yelp_academic_dataset_business.json')
    print(len(businessId))
    businessId = businessId[:10]
    train = nBObj.getTrainData('yelp_academic_dataset_review.json', businessId)

    print(train)
    cl = NaiveBayesClassifier(train)

    cl.show_informative_features(20)  # prints directly; returns None
    print("Opening the file...")
    target = open("naiveBayesResult.txt", 'w')

    for (sentence, rating) in nBObj.testSentences:
        clOutput = nBObj.testSentence(sentence, cl)
        strToWrite = str(rating) + "\t" + clOutput
        target.write(strToWrite)
        target.write("\n")

    target.close()
    nBObj.calcAccuracy()
Code Example #15
def create_sentiment_model():

    random.seed(1)

    # Grab some movie review data
    reviews = [(list(movie_reviews.words(fileid)), category)
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]
    random.shuffle(reviews)
    new_train, new_test = reviews[:1900], reviews[1900:]

    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy
    accuracy = cl.accuracy(new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features
    cl.show_informative_features(5)  # prints directly; returns None

    with open('sentiment_clf_full.pkl', 'wb') as pk:
        dill.dump(cl, pk)
    print('done saving model')
Code Example #17
def train():

    try:
        print("Training has begun")
        trainingData = createTrainingData()
        #print trainingData
        classifier = NaiveBayesClassifier(trainingData)
        classifier.show_informative_features()  # prints directly; returns None
        with open('threat_classifierOntology.pkl', 'wb') as output:
            pickle.dump(classifier, output, pickle.HIGHEST_PROTOCOL)
        print("Training has completed")
    except Exception:
        now = time.strftime("%x")
        error = sys.exc_info()[0].__name__ + ': ' + str(sys.exc_info()[1])
        with open("trainingErrors.txt", "w") as f:
            f.write(now)
            f.write('\n')
            f.write(error)
Code Example #18
def engine(text):
    from textblob.classifiers import NaiveBayesClassifier
    from textblob.classifiers import MaxEntClassifier
    from textblob.classifiers import NLTKClassifier
    url_train = "https://"
    file_train = "train.csv"
    if not (os.path.isfile(file_train)):
        with open(file_train, 'wb') as handle:
            print("Train loaded from Request:", url_train)
            response = requests.get(url_train, stream=True)
            if not response.ok:
                # Something went wrong
                pass
            for block in response.iter_content(1024):
                handle.write(block)
            # the with-block closes the file automatically; no explicit close() needed
            print("Request DONE")
    else:
        print("Train loaded from cache:", file_train)

    with open(file_train, 'r', encoding="utf8") as fp:
        #cl = MaxEntClassifier(fp)

        cl = NaiveBayesClassifier(fp)

    # print(cl.classify("This is an amazing library!"))
    # print(cl.accuracy(test))
    # cl.update(test)
    # print(cl.accuracy(test))

    prob_dist = cl.prob_classify(text)
    print("TEST:", text, " ", prob_dist, " ", prob_dist.max())
    for a in prob_dist.samples():
        print(a, ":", round(prob_dist.prob(a), 2))
    cl.show_informative_features()  # prints directly; returns None
    aa = cl.extract_features(text)
    print(aa)
    print("---------------------------------------")

    return cl.classify(text)
Code Example #19
File: test_classifiers.py  Project: shipci/TextBlob
class TestNaiveBayesClassifier(unittest.TestCase):
    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text),
                     basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(
            ["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        feats = self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        cl = NaiveBayesClassifier(CSV_FILE, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        cl = NaiveBayesClassifier(CSV_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        cl = NaiveBayesClassifier(JSON_FILE, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        cl = NaiveBayesClassifier(JSON_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_accuracy_on_a_csv_file(self):
        a = self.classifier.accuracy(CSV_FILE)
        assert_true(isinstance(a, float))

    def test_accuracy_on_json_file(self):
        a = self.classifier.accuracy(JSON_FILE)
        assert_true(isinstance(a, float))

    def test_init_with_tsv_file(self):
        cl = NaiveBayesClassifier(TSV_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
                      lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(
            repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(
                len(train_set)))
Code Example #20
import pickle

from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus import twitter_samples

pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

# label the positive tweets
pos_tweets_set = []
for tweet in pos_tweets:
    pos_tweets_set.append((tweet, 'pos'))

# label the negative tweets
neg_tweets_set = []
for tweet in neg_tweets:
    neg_tweets_set.append((tweet, 'neg'))

# random.shuffle(pos_tweets_set)
# random.shuffle(neg_tweets_set)

# hold out the first 1000 tweets of each class for testing and train on the
# next 1000, so the two sets do not overlap
test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
train_set = pos_tweets_set[1000:2000] + neg_tweets_set[1000:2000]

__NaiveBayesClassifier = NaiveBayesClassifier(train_set)
print("Accuracy: {}".format(__NaiveBayesClassifier.accuracy(test_set)))
print(__NaiveBayesClassifier.show_informative_features(10))

# save model for later use
pickle.dump(__NaiveBayesClassifier, open("naivebayes.pickle", "wb"))
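Loading the pickled model back for later use is symmetric; a minimal sketch, assuming the naivebayes.pickle file written above:

import pickle

with open("naivebayes.pickle", "rb") as f:
    clf = pickle.load(f)

print(clf.classify("I love this new phone!"))   # should lean 'pos'
print(clf.classify("Worst service ever."))      # should lean 'neg'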
Code Example #21
File: classifier.py  Project: duneding/gensory
#tx_cl = "I feel amazing!"
#tx_prob = "This one's a doozy."
tx_cl = "El subte esta demorado"
tx_prob = "El subte funciona bien"

cl = NaiveBayesClassifier(train)
print(cl.classify(tx_cl))
print(cl.classify("El subte funciona bien"))
prob_dist = cl.prob_classify(tx_prob)
print(prob_dist.max())
print(round(prob_dist.prob("pos"), 2))
print(round(prob_dist.prob("neg"), 2))

print(cl.accuracy(data_sets.en_test))
cl.show_informative_features(5)  # prints directly; returns None

# Using TextBlob
blob = TextBlob("No funca por que hay obras para mejorar la cosa", classifier=cl)
print(blob.sentiment)
print(blob.classify())

blob = TextBlob("El subte funciona normal", classifier=cl)
print(blob.sentiment)
print(blob.classify())

blob = TextBlob("Se realizan obras en el subte A", classifier=cl)
print(blob.sentiment)
print(blob.classify())

blob = TextBlob("No funciona, anda averiguar por que. Quizas hay un accidente", classifier=cl)
Code Example #22
for decider in deciders:
    decider = 'un/data/' + decider
    with open(decider) as f_input:
        decider_files.append(f_input.read())

new_decider = []
for speech in decider_files:
    sentences = sent_tokenize(speech)
    for sentence in sentences:
        new_decider.append(sentence)

# create classifier
train, test = train_test_split(speeches, test_size=0.1)

train_set = list(train.itertuples(index=False, name=None))
test_set = list(test.itertuples(index=False, name=None))

my_classifier = NaiveBayesClassifier(train_set)

#try out the deciders
my_classifier.classify(new_decider[0])
prob_dist = my_classifier.prob_classify(new_decider[1])
print(prob_dist.max())
print(round(prob_dist.prob('SOV'), 2))
print(round(prob_dist.prob('OPN'), 2))

#test accuracy
accuracy = my_classifier.accuracy(test_set)
print(accuracy)
my_classifier.show_informative_features(5)  # prints directly; returns None
Code Example #23
test_data = [
    ("Fluggastdatenspeicherung: EU-Parlament votiert für PNR-Datenbank"),
    ("Chipmaschinen-Hersteller: ASML liefert sechs EUV-Belichtungsmaschinen an Intel aus"),
    ("Apple: iCloud löscht unter Umständen Daten unwiederbringlich"),
    ("Spionagesoftware: Hacking Team nutzt UEFI-Rootkit"),
    ("Mobilfunk: 5G soll für Nutzer wie ein unbegrenztes System sein"),
    ("Mobilfunknetzbetreiber: Kostenloses WLAN für Regionalzüge kommt"),
    ("Kickstarter: Kerze lädt Smartphone"),
    ("Hacking Team: Carabinieri kapern mal kurz das Internet"),
    ("Nach Hackerangriff: OPM-Chefin Katherine Archuleta tritt zurück"),
    ("Smartphone-Hersteller: Geeksphone hört auf"),
    ("Systemverschlüsselung: Yubikeys Zwei-Faktor-Authentifizierung unter Linux nutzen"),
    ("Kritik an Dieter Nuhr: Wir alle sind der Shitstorm"),
    ("Navigationsgerät: Autofahrer verursacht wegen Navi schweren Unfall"),
    ("Until Dawn angespielt: Das Horrorhaus der tödlichen Entscheidungen"),
    ("Satoru Iwata: Nintendo-Chef im Alter von 55 Jahren gestorben"),
    ("Call of Duty: Zombies à la Film noir")

    ]

nbc = NaiveBayesClassifier(train_data, lang='de_DE')

for data in test_data:
    print(nbc.classify(data))

print(nbc.accuracy(train_data))  # note: accuracy measured on the training data
nbc.show_informative_features(5)  # prints directly; returns None
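train_data itself is not shown in this excerpt. TextBlob only needs an iterable of (text, label) pairs, so a plausible shape, with hypothetical category labels invented for illustration, would be:

# hypothetical (text, label) pairs -- the real labels are not shown in the excerpt
train_data = [
    ("Apple: Neues iPhone kommt im September", "tech"),
    ("Bundesliga: Bayern gewinnt das Spitzenspiel", "sport"),
]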



Code Example #24
# preproccess training and testing datasets
# without preprocessing, classification will typically take longer and have lower accuracy
print("Preprocessing datasets...")
print()
sys.stdout.flush()
traindata = preprocessData(traindata, minwordlen=4)
testdata = preprocessData(testdata, minwordlen=4)

# train the Naive Bayes Classifier
print("Training Naive Bayes Classifier...")
print()
sys.stdout.flush()
nbc = NaiveBayesClassifier(traindata)

# show the most informative features used for classification
nbc.show_informative_features(5)
print()
sys.stdout.flush()

# test the Naive Bayes Classifier
print("Testing Naive Bayes Classifier...")
sys.stdout.flush()
acc = nbc.accuracy(testdata)
print("Accuracy:", round(acc, 4))
print()

# print the confusion matrix
print("Printing Confusion Matrix...")
print()
sys.stdout.flush()
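The excerpt ends before the confusion matrix is actually built. One way to produce it, sketched with NLTK's ConfusionMatrix and assuming testdata is a list of (text, label) pairs as above:

from nltk.metrics import ConfusionMatrix

gold = [label for _, label in testdata]
pred = [nbc.classify(text) for text, _ in testdata]
print(ConfusionMatrix(gold, pred))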
Code Example #25
print('Training models...')
neg_ids = nltk.corpus.movie_reviews.fileids('neg')
pos_ids = nltk.corpus.movie_reviews.fileids('pos')
neg_feats = [(NaiveBayesAnalyzer().feature_extractor(
    nltk.corpus.movie_reviews.words(fileids=[f])), 'neg') for f in neg_ids]
pos_feats = [(NaiveBayesAnalyzer().feature_extractor(
    nltk.corpus.movie_reviews.words(fileids=[f])), 'pos') for f in pos_ids]

# split into train and test
#train_manual = data[:69]
train_manual = neg_m_data[105:] + pos_m_data[105:]
train_mrc = neg_feats + pos_feats  #+ train_manual
#test_data  = data[69:]
test_data = neg_m_data[:105] + pos_m_data[:105]

# create model
print('Testing models...')
cl = NaiveBayesClassifier(train_manual)
cl_2 = NaiveBayesClassifier(train_mrc)

# calculate score
score = round(cl.accuracy(test_data) * 100, 3)
score_2 = round(cl_2.accuracy(test_data) * 100, 3)
print('Classifier 1 (EmotionPix) is', str(score) + '% accurate.')
print('Informative Features:')
cl.show_informative_features(10)  # prints directly; returns None
print('Classifier 2 (NaiveBayes w/ movie review corpus) is',
      str(score_2) + '% accurate.')
print('Informative Features:')
cl_2.show_informative_features(10)
Code Example #27
			for word in words:
				if word not in stopwords.words() and not word.isdigit():
					list_tuples.append((word.lower(),tabsep[0]))
			c+=1
			if c==500:
				break
	return list_tuples
print('importing data...')
a = time.time()
entire_data = get_list_tuples("dataset.txt")
print("It took " + str(time.time() - a) + " seconds to import data")
print('data imported')
random.seed(1)
random.shuffle(entire_data)
train = entire_data[:750]
test = entire_data[750:1500]  # start at 750 so no example is skipped between the splits
print('training data')
a = time.time()
cl = NaiveBayesClassifier(train)
print("It took " + str(time.time() - a) + " seconds to train data")
print('data trained, now checking accuracy:')
accuracy = cl.accuracy(test)
print("accuracy: " + str(accuracy))
cl.show_informative_features(5)

x = ""
while x != "exit":
	x = input("enter an email to check if it is a spam email or not, type exit to exit \n")
	print(cl.classify(x))
Code Example #28
def encode_tweet(tweet):  # remove non-ASCII characters from the tweet text
    tweet_words = []
    words = tweet[0].split()
    for x in words:
        # drop any non-ASCII characters from the word
        x = x.encode('ascii', errors='ignore').decode('ascii')
        tweet_words.append(x)  # append every word, not just the last one
    mod_tweet = " ".join(tweet_words)
    tweet[0] = mod_tweet
    # each tweet goes to exactly one of the two datasets
    if counter > 100:
        test_tweets.append(tweet)
    else:
        train_tweets.append(tweet)

with open("tweets1.csv") as data_file:
	data = csv.reader(data_file, delimiter=',')
	for tweet in data:
		encode_tweet(tweet)
		counter += 1

classifier = NaiveBayesClassifier(train_tweets)
print("Accuracy of the classifier: {0}".format(classifier.accuracy(test_tweets)))
classifier.show_informative_features(10)

print("Training complete")

test = input("Enter the string:")
if classifier.classify(test) == 0:
    print("Sentiment: negative")
else:
    print("Sentiment: positive")
Code Example #29
#
# w= Word('running')
# print w.lemmatize()

#Text Classify

train = [('I love this sandwich.', 'pos'),
         ('this is an amazing place!', 'pos'),
         ('I feel very good about these beers.', 'pos'),
         ('this is my best work.', 'pos'), ("what an awesome view", 'pos'),
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'), ('he is my sworn enemy!', 'neg'),
         ('my boss is horrible.', 'neg')]

test = [('the beer was good.', 'pos'), ('I do not enjoy my job', 'neg'),
        ("I ain't feeling dandy today.", 'neg'), ("I feel amazing!", 'pos'),
        ('Gary is a friend of mine.', 'pos'),
        ("I can't believe I'm doing this.", 'neg')]

cl = NaiveBayesClassifier(train)
print(cl.classify("This is an amazing library!"))
print(cl.accuracy(test))

cl.show_informative_features(5)  # prints directly; returns None

prob_dist = cl.prob_classify("This one's a doozy.")
print(prob_dist.max())
Code Example #30
print(c.classify('seu lindo'))  # A

print('labels:', c.labels())  # labels: ['A', 'B']

test = [('Voce e muito gato', 'A'), ('Voce e muito feio', 'B')]

print('acuracia:', c.accuracy(test))  # acuracia: 0.5

test = [('Voce e muito lindo', 'A'), ('Voce e muito feio', 'B')]

print('acuracia:', c.accuracy(test))  # acuracia: 1.0

print('features:', c.extract_features('Eu sou horroroso'))
# features: {'contains(Eu)': True, 'contains(sou)': True, 'contains(lindo)': False, 'contains(feio)': False}

c.show_informative_features()
# Most Informative Features
#            contains(sou) = True                B : A      =      1.0 : 1.0
#             contains(Eu) = True                B : A      =      1.0 : 1.0

#
# But the textblob package is more than
# text classification. Let's have a look.
#
from textblob import TextBlob
text = TextBlob(
    "I went home. Because I'm happy. Clap along if you feel like a room without a roof."
)

print('text:', text)
# text: I went home. Because I'm happy. Clap along if you feel like a room without a roof.
Code Example #31
    data = read_target_data()

    print("Predicting Sentiment...")
    # predict sentiment on data's text
    data[['pred', 'p_pos',
          'p_neg']] = data.text.apply(lambda x: pd.Series(get_sentiment(x)))

    print("Calculating Ratios...")
    # use predicted sentiment to fit a dummy model
    # allowing us to get pos:neg ratios
    dummy_train = data[['text', 'pred']].values.tolist()
    dummy_cl = NaiveBayesClassifier(dummy_train)

    # shove the ratio structure into a file; close it so the output is
    # flushed to disk before it is read back below
    mif_file = open('./data/mif.txt', 'w')
    sys.stdout = mif_file
    dummy_cl.show_informative_features(100)
    sys.stdout = sys.__stdout__
    mif_file.close()

    print("Preparing Report...")
    # parse the raw ratios file and create dataframe
    with open("./data/mif.txt") as f:
        mif = f.read().split('\n')[1:]
        mif_df = pd.DataFrame([parse_mif(i) for i in mif])

    # create percentages from ratios
    N = (mif_df.neg + mif_df.pos)
    mif_df['pct_neg'] = mif_df.neg / N
    mif_df['pct_pos'] = mif_df.pos / N

    # save data sorted by the ratios
    mif_df.dropna().sort_values("high", ascending=False).head(100).to_csv(
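The file-redirect trick above works because show_informative_features() prints its table and returns None. The same capture can be done in memory with the standard library; a sketch:

import io
from contextlib import redirect_stdout

buf = io.StringIO()
with redirect_stdout(buf):
    dummy_cl.show_informative_features(100)
mif_lines = buf.getvalue().split('\n')[1:]   # same text the file-based version parses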
Code Example #32
    except UnicodeDecodeError:
        continue

    test.append([post_body, i_file])

Bayes = NaiveBayesClassifier(train)

print(os.getcwd())

pos = []
neg = []
for body in test:

    judge = Bayes.classify(body[0])
    if judge == "positive":
        call(['mv', "./" + body[1], "john/"])
        os.getcwd()
    if judge == "negative":
        call(['mv', "./" + body[1], "non_john/"])

os.mkdir("hard_to_classify")
remaining = glob.glob("*.html")
for doc in remaining:

    call(['mv', "./" + doc, "hard_to_classify/"])

# print(Bayes.accuracy(test))
Bayes.show_informative_features(10)

# #advanced feature extraction - slang and misspellings
Code Example #33
print(cl.classify("This is an amazing library!"))
# get the label probability distribution
prob_dist = cl.prob_classify("This one's a doozy.")
print(prob_dist.max())
print(round(prob_dist.prob("pos"), 2))
print(round(prob_dist.prob("neg"), 2))
# classifying textblob
blob = TextBlob("The beer is good. But the hangover is horrible.",
                classifier=cl)
print(blob.classify())
for s in blob.sentences:
    print(s)
    print(s.classify())
# evaluating classifiers
print(cl.accuracy(test))
cl.show_informative_features(5)  # displays the most informative features (prints directly; returns None)
# updating classifiers with new data
new_data = [('She is my best friend.', 'pos'),
            ("I'm happy to have a new friend.", 'pos'),
            ("Stay thirsty, my friend.", 'pos'),
            ("He ain't from around here.", 'neg')]
print(cl.update(new_data))
print(cl.accuracy(test))


# feature extractors
# creating a feature extractor that just uses the first and last words of a document as its features
def end_word_extractor(document):
    tokens = document.split()
    first_word, last_word = tokens[0], tokens[-1]
    feats = {}
    feats["first({0})".format(first_word)] = True
    feats["last({0})".format(last_word)] = True
    return feats
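With the extractor completed, plugging it in follows the usual TextBlob pattern; a sketch, assuming the train list defined earlier in the source file:

cl2 = NaiveBayesClassifier(train, feature_extractor=end_word_extractor)
blob = TextBlob("I'm excited to try my new classifier.", classifier=cl2)
print(blob.classify())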
Code Example #34
class TestNaiveBayesClassifier(unittest.TestCase):

    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                        self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        feats = self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_custom_format(self):
        redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')]

        class MockRedisFormat(formats.BaseFormat):
            def __init__(self, client, port):
                self.client = client
                self.port = port

            @classmethod
            def detect(cls, stream):
                return True

            def to_iterable(self):
                return redis_train

        formats.register('redis', MockRedisFormat)
        mock_redis = mock.Mock()
        cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234)
        assert_equal(cl.train_set, redis_train)

    def test_data_with_no_available_format(self):
        mock_fp = mock.Mock()
        mock_fp.read.return_value = ''

        assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp))

    def test_accuracy_on_a_csv_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_accuracy_on_json_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_init_with_tsv_file(self):
        with open(TSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
            lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set)))
Code Example #35
class LogicProc:
    def __init__(self, preclassified_file, channel, slack_token):

        if not os.path.isfile(preclassified_file):
            print('"' + preclassified_file + '" does not exist!')

        with open(preclassified_file, 'r') as train_set:
            print('training from ' + preclassified_file)
            self.spam_classifier = NaiveBayesClassifier(train_set, format="csv")

        self.slack_client = slack_interface.SlackInterface(slack_token)
        self.message_queue = []
        self.last_message_ts = None
        self.channel = self.slack_client.get_channel_id(channel)
        if self.channel is None:
            print('Could not find channel ' + channel)
        self.db_interface = database_interface.DB()

        training = self.db_interface.get_training_data()
        self.spam_classifier.update(training)
        self.update_classifier_from_slack(self.channel)

        self.spam_classifier.show_informative_features()

        self.check_twitter_msgs = infinite_timer.InfiniteTimer(5.0, self.proc_messages)
        self.check_slack_msgs = infinite_timer.InfiniteTimer(60.0, self.update_classifier_from_slack, self.channel)
        self.check_twitter_msgs.start()
        self.check_slack_msgs.start()


    def add_new_message(self, msg, source):
        """
        Callback from Twitter when there is a new message
        @param msg     The Twitter message, with all its attributes
        @param source  Where the message came from.  Right now should only be 'twitter'
        """
        self.message_queue.append({'source': source, 'message': msg})

    def proc_messages(self):
        # iterate over a copy: removing items from a list while iterating it skips entries
        for msg in list(self.message_queue):
            if msg['source'] == 'twitter':
                message = msg['message']
                if self.quality_filter(message.text):
                    print('GOOD: ' + message.text)
                    self.post_to_slack(message, self.channel)
                    self.store_message(message.text, True)
                else:
                    print('BAD: ' + message.text)
                    self.store_message(message.text, False)
            self.message_queue.remove(msg)

    def run_loop(self):
        """
         Not sure what this was originally intended to do..
         now it runs proc_messages once a second
        """
        while True:
            # sleep between polling queue
            time.sleep(1)

    def quality_filter(self, message_text):
        # -filter useless hashtag announcements "Prayers for Irma! Use #IrmaSoS"
        # -filter outside the geobounds
        # -filter duplicates
        # -bayesian filter
        result = self.spam_classifier.classify(message_text)
        if result == 'neg':
            return False
        else:
            return True

    def post_to_slack(self, msg, channel):
        self.slack_client.post_message(msg.text, channel)

    def update_classifier_from_slack(self, channel):
        slack_msgs = self.slack_client.get_slack_reactions(channel, self.last_message_ts)
        if len(slack_msgs)>0:
            self.last_message_ts = slack_msgs[-1]['ts']
        bayesian_update_data = []
        for m in slack_msgs:
            user_feedback = self.is_slack_reaction_pos(m['reactions'])
            text = m['text']
            if user_feedback is None:
                pass
            elif user_feedback:
                bayesian_update_data.append((text, 'pos'))
            else:
                bayesian_update_data.append((text, 'neg'))
        # update for better results if we can
        if len(bayesian_update_data) > 0:
            print('updating db...')
            # update classification in DB
            self.db_interface.update(bayesian_update_data)
            # update classifier
            print('updating classifier...')
            self.spam_classifier.update(bayesian_update_data)
            print('done...')
            self.spam_classifier.show_informative_features()

    def is_slack_reaction_pos(self, reactions):
        for t in reactions:
            name = t['name']
            if name == '-1':
                return False
            if name == '+1':
                return True
        return None


    def store_message(self, message, filter_classification, source='twitter'):
        self.db_interface.add(message,filter_classification, source)
        
    def bayesian_search(self, query):
        results = self.api.search(query)
        filtered_results = [r for r in results if self.is_spam(r.text) == 0]
        return filtered_results
Code Example #38
"Korban diajak tersangka ke musala di dekat pondok. Saat kondisi sepi dan hanya berdua dengan korban, tersangka mencabuli korban," kata Wahyu kepada wartawan, Minggu (20/3/2016).

Lantaran menganggap Nurul sebagai Gus, korban pun tak berani menolak permintaan tersangka. Terlebih lagi, tersangka membujuk korban bahwa perbuatan cabul itu untuk memasukkan ilmu kebatinan ke tubuh korban.

"Tersangka berdalih untuk mengajari korban ilmu tasawuf. Nyatanya itu hanya untuk memuluskan niat tersangka agar bisa mencabuli korban," ungkapnya.

Menurut Wahyu, perbuatan cabul itu dilakukan tersangka kepada korban berulang kali selama 2 tahun terakhir. Bahkan korban diminta membayar uang kepada tersangka setiap kali usai melakukan pencabulan. Nilainya antara Rp 200.000 hingga jutaan rupiah.

"Tersangka juga meminta uang dari korban berulang kali. Total kerugian korban Rp 40 juta," sebutnya.

Tak tahan dengan perbuatan Nurul, lanjut Wahyu, korban pun memutuskan buka mulut ke teman sesama santri. Mendapat dukungan dari teman-temannya, korban memberanikan diri melapor ke Polres Jombang, Kamis (17/3).

Pada hari yang sama, polisi memutuskan menjebak tersangka. "Saat korban menyerahkan uang yang terakhir kepada tersangka, saat itu tersangka langsung kami tangkap," jelasnya.

Akibat perbuatannya, kini Nurul harus mendekam di Rutan Polres Jombang. Tersangka dijerat dengan Pasal 80 ayat (1) juncto Pasal 82 ayat (1) UU RI No 35 Tahun 2014 tentang Perlindungan Anak dengan ancaman pidana maksimal 15 tahun penjara.

"Kalau ada yang merasa menjadi korban perbuatan tersangka ini, jangan malu melapor, akan kami jaga identitasnya. Karena itu bisa memberatkan tersangka," pungkasnya. """

tic = timeit.default_timer()
renum = ''.join([i for i in text if not i.isdigit()])
text = stem_words(renum)
print("text diatas setelah diklasifikasi yaitu %s\n" % cl.classify(text))
toc = timeit.default_timer()
print ("waktu klasifikasi : ")
print(toc-tic)

print(cl.show_informative_features(20))
# classifier = TextBlob(stemstop_output, classifier=cl)
# print(classifier.classify())

Code Example #39
df = pd.DataFrame({"labels": trainLabels, "trainData": trainData})

train, test = train_test_split(df, test_size = 0.25)

## zip() is lazy in Python 3; materialize the pairs so they can be iterated more than once
training = list(zip(train["trainData"].tolist(), train["labels"].tolist()))
testing = list(zip(test["trainData"].tolist(), test["labels"].tolist()))

## training model
%time model = NBC(training)

%time print(model.accuracy(training))
## getting accuracy of 90%

## Shows important features for detecting intent
model.show_informative_features()  # prints directly; returns None
#Most Informative Features
#        contains(please) = True              Yes : No     =     10.0 : 1.0
#            contains(ve) = True               No : Yes    =      9.9 : 1.0
#        contains(verifi) = True              Yes : No     =      9.3 : 1.0
#          contains(sale) = True               No : Yes    =      9.3 : 1.0
#        contains(moment) = True              Yes : No     =      8.6 : 1.0
#       contains(compani) = True               No : Yes    =      7.3 : 1.0
#      contains(deliveri) = True               No : Yes    =      6.9 : 1.0
#    contains(unsubscrib) = True               No : Yes    =      6.9 : 1.0
#        contains(experi) = True               No : Yes    =      6.9 : 1.0
#         contains(remov) = True               No : Yes    =      6.9 : 1.0

%time print(model.accuracy(testing))
## getting accuracy of 70.3%
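%time is an IPython/Jupyter line magic, so this snippet only runs inside a notebook. Outside IPython, the same measurement can be sketched with time.perf_counter:

import time

t0 = time.perf_counter()
model = NBC(training)  # NBC: the NaiveBayesClassifier alias used above
print("trained in %.2f s" % (time.perf_counter() - t0))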
Code Example #40
File: activeLearning.py  Project: kobauman/signature
def activeLearning(NAME, datapath, infile, iterations = 3, portion = 10):
    logger = logging.getLogger('signature.activeLearning')
    logger.info('Active learning model building')
    
    #load data
    review_file = open(infile,"r")
    
    #convert to appropriate format
    review_corpus = list()
    for i, line in enumerate(review_file):
        try:
            #filter out non-ASCII symbols
            review = json.loads(line)
            review_corpus.append([re.sub(r'[^\x00-\x7f]', r' ', review['text']), review['textFeatures']])
        except:
            logger.error(review['text'])
            continue
    review_file.close()
    logger.info('Data converted - %d reviews'%len(review_corpus))
    
    
    #Shuffle dataset
    #random.seed(1)
    random.shuffle(review_corpus)
    
    try:
        current_train = json.loads(open(datapath + '%s_current_train.json'%NAME,'r').read())
    except:
        current_train = list()
    
    for t in current_train:
        try:
            review_corpus.remove(t[0])
        except:
            pass
    
    logger.info("Len(current_train) = %d"%len(current_train))
    
    '''
    Prepare first portion
    '''
    if len(current_train) > 10:
        #train model
        cl = NaiveBayesClassifier(current_train, feature_extractor=feature_extractor)
        
        #prepare next portion
        ratio = float(sum([int(x[1] == 'g') for x in current_train]))/len(current_train)
        #ratio = 0.5
        logger.info('ratio = %.3f\nclassifying train set ...'%ratio)
        train_classify = [[0.1*random.random() + abs(int(cl.classify(t)=='s')-ratio),t] for t in review_corpus[:1000]]
        train_classify.sort()
        reviews_portion = train_classify[:portion]
    
    else:
        reviews_portion = [y for y in enumerate(review_corpus[:portion])]

    
    '''
    main iterations of active learning
    '''
    for iteration in range(iterations):
        #ask for labels
        for p in range(len(reviews_portion)):
            var = input('''\n\n%s \n(%f)\nPlease give the label to the review 
(g - generic / s - specific): '''%(reviews_portion[p][1][0],reviews_portion[p][0]))
            
            if var.lower().startswith('g'):
                label = 'g'
            elif var.lower().startswith('s'):
                label = 's'
            elif var.lower().startswith('x'):
                logger.info('Finish')
                break
            else:
                logger.info('Bad label')
                continue
        
            #prepare train set
            current_train.append((reviews_portion[p][1],label))
            review_corpus.remove(reviews_portion[p][1])
        
        #train model
        cl = NaiveBayesClassifier(current_train, feature_extractor=feature_extractor)
        
        #prepare next portion
        ratio = float(sum([int(x[1] == 'g') for x in current_train]))/len(current_train)
        #ratio = 0.5
        logger.info('ratio = %.3f\nclassifying train set ...'%ratio)
        train_classify = [[0.1*random.random() + abs(int(cl.classify(t)=='s')-ratio),t] for t in review_corpus[:1000]]
        train_classify.sort()
        reviews_portion = train_classify[:portion]
        
        logger.info('Iteration: %d (%d items), Accuracy on train = %.2f'%(iteration,len(current_train),100*cl.accuracy(current_train)))
        
        current_train_out = open(datapath+'%s_current_train.json'%NAME,'w')
        current_train_out.write(json.dumps(current_train))
        current_train_out.close()
        
    
    cl.show_informative_features(10)

    #test
    random.shuffle(current_train)
    thres = int(0.8*len(current_train))
    train_self = current_train[:thres]
    test_self = current_train[thres:]
    cl_test = NaiveBayesClassifier(train_self, feature_extractor=feature_extractor)
    acc_str = 'Accuracy on test = %.2f with %d items in testset and %d items in trainset'%(100*cl_test.accuracy(test_self),
                                                                                           len(test_self),len(train_self))
    logger.info(acc_str)
    message = list()
    message.append(acc_str)
        
    #saving model
    pickle.dump(cl, open(datapath+ '%s_active_learning.model'%NAME, "wb" ) )
    
    
    return '\n'.join(message)
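
# A minimal sketch (not part of the original file) of reloading the saved
# model in a later session; assumes the same datapath/NAME values and that
# the custom feature_extractor is importable when unpickling, since pickle
# stores the function by reference.
import pickle

with open(datapath + '%s_active_learning.model' % NAME, 'rb') as f:
    cl = pickle.load(f)

# Each item is a [text, textFeatures] pair, as built in review_corpus above.
print(cl.classify(['The staff was friendly and the room was clean.', []]))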
Code example #41
def extractor(word):
    feats = {}
    last_letter = word[-1]
    feats["last_letter({0})".format(last_letter)] = True
    return feats


if __name__ == "__main__":
    # customDicts = {'./texts/wordsEn.txt':'english','./texts/wordsEs.txt':'spanish','./texts/wordsEs2.txt':'spanish'}
    """ customDicts = {'./texts/wordsEn.txt':'english','./texts/wordsEs2.txt':'spanish'} 
	for customDictFilename, customDictLang in customDicts.items(): 
		currentDict = open(customDictFilename,'r') 
		for line in currentDict: 
			wordTrain = (line.replace('\r','').replace('\n',''),customDictLang) 
			train.append(wordTrain) 
		currentDict.close() """
    # print train
    lang_detector = NaiveBayesClassifier(train, feature_extractor=extractor)
    #  lang_detector = NaiveBayesClassifier(train)
    print lang_detector.accuracy(test)
    lang_detector.show_informative_features(5)
    while 1:
        try:
            line = sys.stdin.readline()
            if not line:  # EOF: stop before classifying an empty string
                break
            # strip the trailing newline so last_letter isn't always '\n'
            print lang_detector.classify(line.strip())
        except KeyboardInterrupt:
            break
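
# For reference (not in the original): what the last-letter extractor
# produces for a couple of arbitrary words.
print extractor("hola")     # {'last_letter(a)': True}
print extractor("running")  # {'last_letter(g)': True}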
Code example #42
blob = TextBlob("The beer is good. But the hangover is horrible.",
                classifier=class1)
blob.classify()

for s in blob.sentences:
    print(s)
    print(s.classify())

# ### Evaluating Classifiers

class1.accuracy(test)

# ### Diplay a Listing of the Most Informative Features

class1.show_informative_features(5)

# ### Updating Classifiers with New Data

new_data = [('She is my best friend.', 'pos'),
            ("I'm happy to have a new friend.", 'pos'),
            ("Stay thirsty, my friend.", 'pos'),
            ("He ain't from around here.", 'neg')]
class1.update(new_data)

class1.accuracy(test)

# ### Feature Extractors


# (The source snippet is truncated here; the body below is completed after
# the feature-extractor example in the TextBlob documentation.)
def end_word_extractor(document):
    tokens = document.split()
    first_word, last_word = tokens[0], tokens[-1]
    feats = {}
    feats["first({0})".format(first_word)] = True
    feats["last({0})".format(last_word)] = False
    return feats
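
# A short usage sketch mirroring the TextBlob documentation (the sentence
# is illustrative; `test` is the test set used earlier in this snippet):
cl2 = NaiveBayesClassifier(test, feature_extractor=end_word_extractor)
blob2 = TextBlob("I'm excited to try my new classifier.", classifier=cl2)
print(blob2.classify())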
Code example #43
File: classify.py Project: benhoff/vexparser
                         'what are you working on',
                         'what you making')

experience_utterances = [(x, 'experience') for x in experience_utterances]
environment_utterances = [(x, 'environment') for x in environment_utterances]
working_on_utterances = [(x, 'working') for x in working_on_utterances]

# FIXME: find better way to flatten lists together
training_set = []
training_set.extend(experience_utterances)
training_set.extend(environment_utterances)
training_set.extend(working_on_utterances)


classifier = NaiveBayesClassifier(training_set)
classifier.show_informative_features()  # prints directly and returns None
print(classifier.labels())

bogus_utterances = (
        'if you going to use nltk u may want to check this out spacy .io',
        'sup people? I see the weather\'s getting better over there, Ben.',
        'i had the same problem your having so thats my i made my own.',
        'try http, instead of https'
        )

# TODO: Figure out how to make this stronger
dual_utterance = ('how long have you been coding and what IDE do you use',)

test_utterances = ('what are you making',
                   'hey that nyancat is cool, how do you get that?')

# (The source snippet is truncated here; a plausible loop body is assumed.)
for t in test_utterances:
    print(t, '->', classifier.classify(t))
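
# A sketch (not in the original) of screening the unused bogus_utterances
# with prob_classify, rejecting predictions below an arbitrary 0.5 cutoff:
for b in bogus_utterances:
    dist = classifier.prob_classify(b)
    label = dist.max()
    if dist.prob(label) < 0.5:
        label = 'unknown'  # none of the three intents is a confident match
    print(b, '->', label)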
Code example #44
nltk.download('punkt')
nltk.download('brown')

#Version Check#

print(pd.__version__)
print(tb.__version__)
print(nltk.__version__)

#Data Cleanse and Shuffle#

train = pd.read_csv("training.txt", sep="\t", header=None)
train = train.sample(frac=1, random_state=int(input("Random State? ")))
train.columns = ["Sentiment", "Raw_Text"]
train_data = list(zip(train["Raw_Text"], train["Sentiment"]))

#Train Classifier#

cl = NaiveBayesClassifier(train_data[:1000])

#Test Classifier#

train["Guess"] = train["Raw_Text"].apply(cl.classify)

#Results#

# Compare predicted vs. actual labels row-wise; len() counts rows directly.
Accuracy = round(
    (len(train[train["Sentiment"] == train["Guess"]]) / len(train)) * 100, 2)
print(f"\nAccuracy: {Accuracy}%\n")
cl.show_informative_features()  # prints directly and returns None
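
#Held-Out Check#

# The accuracy above includes the 1000 rows the classifier was trained on.
# A held-out slice (a sketch; the cutoff mirrors the training call above)
# gives an honest estimate:
holdout = train_data[1000:]
print(f"Held-out accuracy: {cl.accuracy(holdout):.2%}")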
Code example #45
    for line in f:
        line = line.replace("\n", "")
        a, b = line.split('|')
        train.append((a, b))
    f.close()
    return train


data = get_csv(file_name[cur_file], must_contain[cur_file])
text = get_text(data)

# train model
if train_enable:
    train = get_train_set(file_name[cur_file])
    cl = NaiveBayesClassifier(train)
    cl.show_informative_features(1000)
    if test_enable:
        data['sentiment'] = 0.0
        for index, row in data.iterrows():
            #print(row)
            #print(cl.prob_classify(row.review_body).prob("pos"))
            val1 = TextBlob(str(row.review_body)).sentiment.polarity
            # floor division collapses prob("pos") to -1 (below 0.5) or 0 (at or
            # above), reaching 1 only at exactly 1.0; plain / may have been intended
            val2 = (cl.prob_classify(str(row.review_body)).prob("pos") - 0.5) // 0.5
            data.at[index, 'sentiment'] = 0.3 * val1 + 0.7 * val2
            #data.at[index, 'sentiment'] = TextBlob(str(row.review_body)).sentiment.polarity
            #data.at[index, 'sentiment'] = round(cl.prob_classify(str(row.review_body)).prob("pos"), 5)
        data.to_csv(file_name[cur_file]+'_sentiment.csv')

if word_cloud_enable:
    f = open(file_name[cur_file]+'_feature.txt', 'r')
    pos_words = {}
Code example #46
"""
Created on Tue Jul 11 09:47:27 2017

@author: mzent
"""
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob
import csv
import pickle

train = []
test = []
with open('trainingData5.csv', newline='', encoding="latin-1") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        train.append((row[0] + ": " + row[2], row[3]))

with open('testData5.csv', newline='', encoding="latin-1") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        test.append((row[0] + ": " + row[2], row[3]))

print("read data")
cl = NaiveBayesClassifier(train)
print("created classifier")
# Compute accuracy
print("Accuracy: {0}".format(cl.accuracy(test)))

# Show the 25 most informative features
cl.show_informative_features(25)

# pickle.dump needs a file object; persist the classifier to disk
# (the filename here is assumed)
with open('classifier.pickle', 'wb') as f:
    pickle.dump(cl, f)
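
# A matching load sketch for a later session (same assumed filename):
with open('classifier.pickle', 'rb') as f:
    cl = pickle.load(f)
print(cl.classify("The delivery was quick and the support team was helpful."))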
Code example #47
    ("I need shelter", "shelter"), ("I need shelter", "shelter"),
    ("The hospitals here are all full", "healthcare"),
    ("I need shelter", "shelter"), ("I need healthcare", "healthcare"),
    ("I am concerned about my children's meantal well being and health, the earthquake has caused big stress",
     "healthcare")
]

# BASIC TESTING FRAMEWORK USING NAIVE BAYES CLASSIFIER MODEL
print("Beginning training set")
classy = NaiveBayesClassifier(train)
print("Training set ended")

print("Beginning testing set")
totalCorrect = 0
totalTestPoints = len(test)
for testPoint in test:
    print("Checking: {}".format(testPoint))
    print("Focus on shelter probability: {}".format(
        classy.prob_classify(testPoint[0]).prob("shelter")))
    print("Focus on healthcare probability {}".format(
        classy.prob_classify(testPoint[0]).prob("healthcare")))
    prediction = classy.classify(testPoint[0])
    if prediction == testPoint[1]:
        totalCorrect += 1
print()
print("=================")
print()
print("Overall Test Accuracy: {0:.2f}%".format(
    (totalCorrect / totalTestPoints) * 100))
classy.show_informative_features(8)
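
# Cross-check (not in the original): the built-in scorer accepts the same
# (text, label) pairs as the manual loop above.
print("accuracy():", classy.accuracy(test))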
Code example #48
def SA():
    r = Rake()
    # Opens file and reads in training data
    # NB classifier trains using the read in data
    with open("datasets/trainingData.csv", 'r') as trainingdata:
        classifier = NaiveBayesClassifier(trainingdata, format="csv")
        print("Training Data")
        classifier.show_informative_features(15)

    # Opens file and reads in testing data
    # Prints testing data accuracy
    # Not needed for final product

    with open("datasets/testingData.csv", 'r') as testingdata:
        print("Testing data accuracy", classifier.accuracy(testingdata))

    # Asks for user input
    userInput = input("Please provide a test input: ")

    # Removes all non letter characters
    regex = re.compile('[^a-zA-Z ]')
    punctuationRemoved = regex.sub('', userInput)
    print("Punctuation removed: ", punctuationRemoved)

    # Defines stopwords
    stop_words = set(stopwords.words('english'))

    # Takes user input, removes stopwords
    word_tokens = word_tokenize(punctuationRemoved)

    # Keep each word that is not a stopword
    filtered_sentence = [w for w in word_tokens if w not in stop_words]

    # Prints list to see new sentence with stopwords removed
    print("Stopwords removed: ", filtered_sentence)

    # Converts the filtered stop word sentence to string
    stringWithoutStopwords = ' '.join(
        [str(elem) for elem in filtered_sentence])

    # Extracts keywords from the filtered sentence
    r.extract_keywords_from_text(stringWithoutStopwords)

    # Ranks the keywords that have been extracted
    ranked_phrases = r.get_ranked_phrases()

    print("Keywords extracted: ", ranked_phrases)

    # Converts extracted keywords list to string
    listToStr = ' '.join([str(elem) for elem in ranked_phrases])

    # Runs string through trained NB classifier
    finalString = TextBlob(listToStr, classifier=classifier)

    # Classify once and reuse the result
    classification = finalString.classify()
    print("String followed by classification: ", finalString, classification)
    binaryClassify = 1 if classification == "pos" else 0

    print(binaryClassify)
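
# A minimal entry point (assumed; the original module's invocation is not shown):
if __name__ == "__main__":
    SA()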
Code example #49
]

if __name__ == "__main__":
    # print "Initiallizing classifier... (training...)"
    # train_positive()
    # train_negative()
    # print train_set
    # classifier = NaiveBayesClassifier(train_set)

    # with open('./texts/words.txt', 'r') as fp:
    #     classifier = NaiveBayesClassifier(fp, format="csv")
    #     print classifier.accuracy(test_set)
    #     print classifier.show_informative_features()

    classifier = NaiveBayesClassifier(train_set)
    print train_set
    print classifier.accuracy(test_set)
    classifier.show_informative_features()  # prints directly and returns None
    print "Ready"
    while 1:
        try:
            line = sys.stdin.readline()
            if not line:  # EOF: stop before classifying an empty string
                break
            prob_dist = classifier.prob_classify(line.lower())
            print prob_dist.max()
            print "PROB POS: " + str(round(prob_dist.prob("pos"), 2))
            print "PROB NEG: " + str(round(prob_dist.prob("neg"), 2))
        except KeyboardInterrupt:
            break