Example #1
def computeNaiveBayes(args, dict_algorithms):
    if args.debug:
        print("Running naive bayes...", end='')
    model = NaiveBayes(args)
    dict_algorithms["naive_bayes"] = model.compute()
    if args.debug:
        print("ok!")
Example #2
    def _populate(self, tweets):
        """
        :param tweets: A python dictionary containing trends as keys and list of tweets as
        values against each trend.
        :return: None

        This is a private method used by the constructor to populate the inverted index object
        """
        for trendName in tweets:
            self.trends.append(trendName)
            self.totalTweets += len(tweets[trendName])

            # classify trend
            tweetsDoc = " ".join([tweet.text for tweet in tweets[trendName]])
            model = NaiveBayes()
            model.loadModelFromDB()
            self.categories.append(model.classify(tweetsDoc))

            for tweet in tweets[trendName]:
                if tweet.user.screen_name not in self.twitterHandles:
                    self.twitterHandles.append(tweet.user.screen_name)
                    posts = [(self.trends.index(trendName), tweet)]
                    self.indexLists.append(posts)
                else:
                    posts = self.indexLists[self.twitterHandles.index(
                        tweet.user.screen_name)]
                    posts.append((self.trends.index(trendName), tweet))
        self.logger.debug(
            'Created and populated Inverted Index: Trends-{}, Tweets-{}'.
            format(len(self.trends), self.totalTweets))
Example #3
def evaluate(trainfile, testfile):
    # load the training data
    trainData = []
    fp = codecs.open(trainfile, "r", "utf-8")
    for line in fp:
        line = line.rstrip()
        temp = line.split()
        trainData.append(temp)
    fp.close()

    # train the naive Bayes model
    nb = NaiveBayes()
    nb.train(trainData)
    print(nb)

    # evaluate on the test data
    hit = 0
    numTest = 0
    fp = codecs.open(testfile, "r", "utf-8")
    for line in fp:
        line = line.rstrip()
        temp = line.split()
        correct = temp[0]  # correct category
        words = temp[1:]  # document: a set of words
        predict = nb.classify(words)  # predict the category with naive Bayes
        if correct == predict:
            hit += 1  # a hit when the prediction matches the correct category!
        numTest += 1
    print("accuracy:", float(hit) / float(numTest))
    fp.close()
Example #4
def main(args):
    """ HamOrSpam entrypoint """

    # Get arguments
    try:
        script, train, test = args
    except ValueError:
        print('usage: python hamorspam.py train.txt test.txt')
        sys.exit(-1)

    classifier = NaiveBayes(['ham', 'spam'])

    # Train classifier
    train_file = open(train, 'r')
    for line in train_file:
        # Discard empty lines
        if not line.strip():
            continue
        typ, message = line.split('\t', 1)
        classifier.teach(typ, message)
    train_file.close()

    # Query classifier
    test_file = open(test, 'r')
    for line in test_file:
        # Discard empty lines
        if not line.strip():
            continue
        print(classifier.classify(line))
    test_file.close()
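
For reference, hamorspam.py above expects each non-empty line of train.txt to be a tab-separated "label<TAB>message" pair (that is what line.split('\t', 1) unpacks); a hypothetical sketch of such a file:

ham	Hey, are we still on for lunch today?
spam	WINNER!! Claim your free prize now by replying to this message
ham	Can you send me the report before 5pm?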
Example #5
def evaluate(trainfile, testfile):
    # load the training data
    trainData = []
    fp = codecs.open(trainfile, "r", "utf-8")
    for line in fp:
        line = line.rstrip()
        temp = line.split()
        trainData.append(temp)
    fp.close()

    # train the naive Bayes model
    nb = NaiveBayes()
    nb.train(trainData)
    print(nb)

    # evaluate on the test data
    hit = 0
    numTest = 0
    fp = codecs.open(testfile, "r", "utf-8")
    for line in fp:
        line = line.rstrip()
        temp = line.split()
        correct = temp[0]    # correct category
        words = temp[1:]     # document: a set of words
        predict = nb.classify(words)  # predict the category with naive Bayes
        if correct == predict:
            hit += 1  # a hit when the prediction matches the correct category!
        numTest += 1
    print("accuracy:", float(hit) / float(numTest))
    fp.close()
Example #6
def crossValidation(data, N=10, randomize=False):  # N=10 assumed; the original default "num" was undefined

    if randomize:
        from random import shuffle
        shuffle(data)

    # Cross Validation
    accuracyList = []
    for n in range(N):
        # split train and test data
        trainData = [d for i, d in enumerate(data) if i % N != n]
        testData = [d for i, d in enumerate(data) if i % N == n]

        # train data
        nb = NaiveBayes()
        nb.train(trainData)

        # accuracy of test data
        hit = 0
        numTest = 0
        for d in testData:
            correct = d[0]
            words = d[1:]
            predict = nb.classify(words)
            if correct == predict:
                hit += 1
            numTest += 1
        accuracy = float(hit) / float(numTest)
        accuracyList.append(accuracy)

    average = sum(accuracyList) / float(N)
    return round(average, 4)
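
A minimal sketch of how crossValidation might be called, assuming the same whitespace-separated "category word word ..." file format that evaluate above reads; the filename is hypothetical.

import codecs

data = []
fp = codecs.open("train.txt", "r", "utf-8")
for line in fp:
    data.append(line.rstrip().split())
fp.close()
print("10-fold accuracy:", crossValidation(data, N=10, randomize=True))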
Example #7
def cross_validate(folds, method):
    if folds < 2:
        print('Must have at least 2 folds; evaluating 2-fold cross validation')
        folds = 2
    test_size = 100/folds
    training_size = 100 - test_size
    songs_by_class = split_by_class()
    sentiment_accuracy_sum = 0.0
    emotion_accuracy_sum = 0.0
    for f in range(folds):
        lo, hi = int(test_size * f), int(test_size + test_size * f)
        test_set = (songs_by_class['+'][lo:hi] + songs_by_class['0'][lo:hi] +
                    songs_by_class['-'][lo:hi])
        training_set = (songs_by_class['+'][hi:] + songs_by_class['+'][:lo] +
                        songs_by_class['0'][hi:] + songs_by_class['0'][:lo] +
                        songs_by_class['-'][hi:] + songs_by_class['-'][:lo])
        if method == 'nb':
            nb = NaiveBayes()
            nb.train_model(training_set)
            sentiment_accuracy, emotion_accuracy = nb.evaluate_model(test_set, len(training_set))
            emotion_accuracy_sum += emotion_accuracy
            sentiment_accuracy_sum += sentiment_accuracy
        elif method == 'sa':
            sa = SimpleAveraging()
            avgs = sa.train(training_set)
            sentiment_accuracy, emotion_accuracy = sa.evaluate(test_set, avgs)
            emotion_accuracy_sum += emotion_accuracy
            sentiment_accuracy_sum += sentiment_accuracy
        # elif method == 'pool':
        #     pool = AffectPool(NaiveBayes(), SimpleAveraging())
        #     pool.simple_train(training_set)
        elif method == 'r':
            nb = NaiveBayes()
            nb.train_model(test_set + training_set)
    print "EMOTION ACCURACY ", emotion_accuracy_sum / folds, " SENTIMENT ACCURACY: ", sentiment_accuracy_sum / folds
Example #8
def test_naivebayes(traindata, testdata):
    # raw pixel features
    feature_domains = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domains, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        print(a)
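
Note that every per-pixel feature domain above is the same three quantized intensity levels; a quick check of that claim:

import numpy as np

# Each pixel feature takes one of three quantized intensity values.
print(list(np.arange(0, 1.1, 0.5)))  # [0.0, 0.5, 1.0]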
Example #9
def main():
    url, model = sys.argv[1], sys.argv[2]
 
    classifier = NaiveBayes()
    classifier.load(model)

    page = urlopen(url).read()

    soup = BeautifulSoup(page, 'html.parser')
    tags = [tag.name for tag in soup.find_all(True)]
    classification = classifier.classify(tags)

    print("Classified as: %s" % classification)
Example #10
    def test_naivebayes_compare(self):
        basepath = '../resource/'
        naivebayes = NaiveBayes()
        json_data = Serializer.load_json(os.path.join(basepath, 'ocr.json'))
        naivebayes.human_labels = json_data['translate']['country']
        x_list = ['ネツァワル王国', 'カセドリア連合王国', 'ゲブランド帝国', 'ホルデイン王国', 'エルソード王国']

        print(json_data['translate']['country'])
        out = naivebayes.predict_all(x_list)
        for i, y in enumerate(out):
            if x_list[i] != y:
                raise Exception('compare x:{0},predict:{1}'.format(
                    x_list[i], y))
Example #11
class CrossValidation(object):
    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        data = []
        for category, ids in user_ids.items():
            tweets = get_tweets(ids)
            categories = [category] * len(tweets)
            data += list(zip(tweets, categories))

        np.random.shuffle(data)
        return data

    def split(self, data, test_percentage):
        n_test = int(len(data)*test_percentage)
        n_training = len(data)-n_test

        # unzip (inverse of zip)
        training = zip(*data[:n_training])
        test = zip(*data[n_training:])
        return training, test

    def show_tweets_with_labels(self, tweets, labels):
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted
        from tweets.
        """

        if not (0 <= test_percentage <= 1):
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        tweets, categories = training
        self.classifier.fit(tweets, categories)

        tweets, answers = test
        results = self.classifier.predict(tweets)

        if verbose:
            self.show_tweets_with_labels(tweets, results)

        return results, answers
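
Since evaluate returns the predictions alongside the true labels, accuracy is left to the caller; a sketch, assuming user_ids maps each category name to a list of Twitter IDs (the IDs below are hypothetical):

user_ids = {"tech": ["user_a", "user_b"], "sports": ["user_c"]}
cv = CrossValidation()
results, answers = cv.evaluate(user_ids, test_percentage=0.2, verbose=False)
accuracy = sum(r == a for r, a in zip(results, answers)) / len(answers)
print("accuracy:", accuracy)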
Example #12
def main(flag=True):
    if flag:
        start = timer()
        # load the labels for the email data
        label_df = pd.read_csv("./input/trec06c/full/index_bak",
                               sep=' ..',
                               names=['label', 'filename'])

        for key in label_df['label'].unique():
            print(key, len(label_df[label_df['label'] == key]))

        train, valid = train_test_split(label_df,
                                        test_size=0.2,
                                        random_state=2018)

        normFilelen = train[train['label'] == 'ham'].shape[0]
        spamFilelen = train[train['label'] == 'spam'].shape[0]

        model = NaiveBayes(normFilelen, spamFilelen)
        # model.getStopWords()

        for index, row in tqdm(train.iterrows(), total=train.shape[0]):
            # save the words appearing in each email into wordsList
            model.get_word_list('./input/trec06c' + row['filename'],
                                row['label'])
        print('Finished learning the training set, elapsed %.2fs' % (timer() - start))

        for index, row in tqdm(valid.iterrows(), total=valid.shape[0]):
            if 'test' in model.wordDict.keys():
                model.wordDict['test'].clear()
            model.get_word_list('./input/trec06c' + row['filename'], 'test')
            wordProbList = model.getTestWords(model.wordDict['test'])
            # compute the Bayes probability over the 15 words extracted from each email
            trash_p = model.calBayes(wordProbList)
            if row['label'] == 'spam':
                if trash_p > 0.9:
                    model.validResult['TN'] += 1  # trash
                else:
                    model.validResult['FN'] += 1  # normal
            else:
                if trash_p > 0.9:
                    model.validResult['FP'] += 1  # trash
                else:
                    model.validResult['TP'] += 1  # normal
        model.calMetric()
        print('Finished processing the validation set, elapsed %.2fs' % (timer() - start))
        pickle.dump(model, open('bayes_model.obj', 'wb'))
    else:
        model = pickle.load(open('bayes_model.obj', 'rb'))
        print("模型加载成功!")
    return model
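
calMetric itself is not shown; from the confusion counts accumulated above (this snippet's convention: TP = ham kept, TN = spam caught, FP = ham flagged, FN = spam missed), the usual metrics would presumably follow as in this sketch:

# Hypothetical metric computation from model.validResult.
r = model.validResult
total = r['TP'] + r['TN'] + r['FP'] + r['FN']
accuracy = (r['TP'] + r['TN']) / total
spam_precision = r['TN'] / (r['TN'] + r['FP'])  # flagged mail that really was spam
spam_recall = r['TN'] / (r['TN'] + r['FN'])     # spam that was caught
print(accuracy, spam_precision, spam_recall)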
Example #13
class CrossValidation(object):
    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        data = []
        for category, ids in user_ids.items():
            tweets = get_tweets(ids)
            categories = [category] * len(tweets)
            data += list(zip(tweets, categories))

        np.random.shuffle(data)
        return data

    def split(self, data, test_percentage):
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test

        # unzip (inverse of zip)
        training = zip(*data[:n_training])
        test = zip(*data[n_training:])
        return training, test

    def show_tweets_with_labels(self, tweets, labels):
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted
        from tweets.
        """

        if not (0 <= test_percentage <= 1):
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        tweets, categories = training
        self.classifier.fit(tweets, categories)

        tweets, answers = test
        results = self.classifier.predict(tweets)

        if verbose:
            self.show_tweets_with_labels(tweets, results)

        return results, answers
Example #14
    def train(self, train_set):
        """Teaches the classifier with labeled data instances."""
        for d in train_set:
            self.corpus.add_doc(d)
        print('Training on %d documents...\n' % len(train_set))
        if isinstance(self.classifier, NaiveBayes):
            self.classifier.train(self.corpus)
            for c in self.corpus.get_classes():
                if len(c.get_classes()) > 1:
                    subclassifier = NaiveBayes()
                    subclassifier.train(c)
                    self.subclassifiers[c.get_label()] = subclassifier
        else:  # for nltk/sklearn classifiers
            labeled_feature_set = [(d.get_features(), d.get_labels()[0])
                                   for d in train_set]
            self.classifier.train(labeled_feature_set)
Example #15
def dev_train():
    docs = build_doc_set('../papers')
    driver = Processor()
    for d in docs:
        driver.process_document(d)
    driver.clf.set_classifier(NaiveBayes())
    driver.clf.train(docs)
    driver.save_classifier('saved_classifier-367-1')
Example #16
class Classifier(object):
    def __init__(self):
        self.classifier = NaiveBayes()

    def learn_from_tweets(self, user_ids, category):
        """
        Train the classifier by tweets.
        user_ids : A list of twitter ids which their tweets are included
        in the category.
        category : The category of the tweets.
        """
        tweets = get_tweets(user_ids)
        categories = [category] * len(tweets)
        self.classifier.fit(tweets, categories)
        print("Training...")

    def predict_user_input(self):
        """Read user input until 'exit' is entered"""
        sentence = input("input =>")
        while sentence != 'exit':
            category = self.classifier.predict_(sentence)
            print("{}\n".format(category))
            sentence = input("input =>")

    def save(self, filename):
        """Save the model."""
        self.classifier.dump_json(filename)

    def load(self, filename):
        """Load the model from a file."""
        self.classifier.load_json(filename)
Example #17
class Classifier(object):
    def __init__(self):
        self.classifier = NaiveBayes()

    def learn_from_tweets(self, user_ids, category):
        """
        Train the classifier by tweets.
        user_ids : A list of twitter ids which their tweets are included
        in the category.
        category : The category of the tweets.
        """
        tweets = get_tweets(user_ids)
        categories = [category] * len(tweets)
        self.classifier.fit(tweets, categories)
        print("Training...")

    def predict_user_input(self):
        """Read user input until 'exit' is entered"""
        sentence = input("input =>")
        while sentence != 'exit':
            category = self.classifier.predict_(sentence)
            print("{}\n".format(category))
            sentence = input("input =>")

    def save(self, filename):
        """Save the model."""
        self.classifier.dump_json(filename)

    def load(self, filename):
        """Load the model from a file."""
        self.classifier.load_json(filename)
Example #18
def dev_train_test():
    """Train and test a new classifier on a directory of .txt documents."""
    docs = build_doc_set('../papers')
    print('Processing docset with %d docs...' % len(docs))
    driver = Processor()
    for d in docs:
        driver.process_document(d)
    driver.clf.set_classifier(NaiveBayes())
    driver.clf.train_and_test(docs, split=.07)
Example #19
    def test_naivebayes_labeling(self):
        naivebayes = NaiveBayes()
        corpus = Serializer.load_csv('../resource/corpus.tsv')
        data = []
        target = []
        for row in corpus:
            data.append(str(row[0]))
            t = int(row[1])
            if t > 5:
                raise Exception(t)
            target.append(t)
        target = np.array(target, dtype=np.uint8, ndmin=1)
Example #20
class Ranking(object):
    def __init__(self, config):
        self.ocr = OCREngine()
        self.naivebayes = NaiveBayes()
        self.naivebayes.human_labels = self.ocr.settings['translate'][
            'country']

    def create_temporary_file(self, buffer, verbose=False):
        """
        @param {io.BytesIO} buffer
               {bool} verbose
        @return {string} path of the created temporary file
        """
        temp_file_name = ''
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            temp.write(buffer.getvalue())
            temp_file_name = temp.name
            if verbose:
                logger.info(temp_file_name)
        return temp_file_name

    def getResult(self, src, save_image=False):
        """
            @param {string} src
                   {bool}save_image output debug image
            @return {OCRDocument} document
        """
        pro = DataProcessor(src, ImageType.RAW, save_image=save_image)
        if pro.prepare() is None:
            logger.error('image error:{0}'.format(src))
            return None
        buffer = pro.tobinary(pro.batch())
        temp_file_name = self.create_temporary_file(buffer, True)

        document = self.ocr.recognize(temp_file_name)
        os.remove(temp_file_name)

        output = '#' + datetime.now().strftime('%F %T.%f')[:-3] + '\n'
        output += '\n'.join(document.names()) + '\n'
        with Serializer.open_stream('../temp/corpus.txt', mode='a') as file:
            file.write(output)

        # ocr corpus data -> NaiveBayes classifier
        # ranking name swap
        change = self.naivebayes.predict_all(document.names())
        # document.changeNames(change)

        document.dump()
        return document
Example #21
def train(instances):
    print('starting training')
    p = None
    if args.algorithm == 'lambda_means':
        p = LambdaMeans(args.cluster_lambda, max_max_index,
                        args.clustering_training_iterations)
        p.train(instances)
    elif args.algorithm == 'nb_clustering':
        p = NaiveBayes(args.num_clusters, max_max_index,
                       args.clustering_training_iterations)
        p.train(instances)

    print('ending training')
    return p
Example #22
def main():

    df = data()

    for train_size in np.linspace(0.5, 0.9, 5):

        train, test = train_test_split(df, train_size=train_size, random_state=42)

        # Since there is only 1 sample with native-country == Holand-Netherlands,
        # ensure that this sample is in the training set
        if "Holand-Netherlands" in test["native-country"].unique():
            train = train.append(test[test["native-country"] == "Holand-Netherlands"])
            test = test[test["native-country"] != "Holand-Netherlands"]

        for ignore_missing in [True, False]:
            nb = NaiveBayes(ignore_missing=ignore_missing)
            nb.learn_parameters(train)
            acc = nb.score(test)

            print(
                "\nTrain size: {} Test error: {} Ignore features with missing values: {}".format(
                    train_size, (1 - acc), ignore_missing
                )
            )
Example #23
def main():
    nb = NaiveBayes()
    nb.load_data_training()
    nb.mulai_training()

    # TODO: [STEP-10] Try making a prediction!
    # If the weather is 'Hujan' (rainy), the temperature is 'Dingin' (cold),
    # the laziness level is 'Tinggi' (high), and the student woke up late ('Ya'),
    # will the student attend class or skip it?

    hasil_prediksi = nb.prediksi(nilai_cuaca='Hujan',
                                 nilai_suhu='Dingin',
                                 nilai_tingkat_malas='Tinggi',
                                 nilai_bangun_siang='Ya')
    print('=====================================')

    print('Final prediction = {}, with a probability of {}%'.format(
        hasil_prediksi['hasil'], (hasil_prediksi['peluang'] * 100)))
Example #24
def trainTrendClassifier():
    """
    :return: None

    This function instantiates a model of the NaiveBayes class and trains the model on the
    categorized trends data. The trained model is stored in the database for future
    classification purpose.
    """
    logger.debug("trainTrendsClassifier()")
    trainingFolder = config['training']['trends']
    trainingDocs, trainingLabels = getData(trainingFolder)
    logger.debug("documents: " + str(len(trainingDocs)) + ", labels: " +
                 str(len(trainingLabels)))

    model = NaiveBayes()

    model.train(trainingDocs, trainingLabels, stopWordsFlag=True, stem=True)
    model.saveToDB()
Example #25
def main():
    dataset_file, model = sys.argv[1], sys.argv[2]

    classes = load_classes(dataset_file)
    train, test = split_dataset(classes, 0.6)

    classifier = NaiveBayes()
    classifier.train(train)

    accuracy, recall, f1 = classifier.perfomance(test)
    print("Total perfomance")
    print("================")
    print("Accuracy: %f" % accuracy)
    print("Recall: %f" % recall)
    print("F1: %f" % f1)
    print("\n")

    class_accuracy, class_recall, class_f1 = classifier.class_perfomance(test)
    print("Class accuracy")
    print("================")
    for klass in class_accuracy:
        print("%s: %f" % (klass, class_accuracy[klass]))
    print("\n")

    print("Class recall")
    print("================")
    for klass in class_recall:
        print("%s: %f" % (klass, class_recall[klass]))
    print("\n")

    print("Class F1")
    print("================")
    for klass in class_f1:
        print("%s: %f" % (klass, class_f1[klass]))
    print("\n")

    classifier.save(model)
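
As a sanity check on the numbers printed above: F1 is the harmonic mean of precision and recall, so it can be recomputed by hand; the values below are illustrative, not from the example.

precision, recall = 0.8, 0.6  # hypothetical values
f1 = 2 * precision * recall / (precision + recall)
print("F1: %f" % f1)  # 0.685714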
Example #26
        with open("winter-" + classifier + ".json") as json_file:
            for line in json_file:
                json_obj = json.loads(line)
                reviews += [(classifier, json_obj)]

#  Creating model objects
model = args.model
if (model == "baseline"):
    model_obj = BaseLine(reviews, categories)

elif (model == "logreg"):
    model_obj = LogReg(reviews)

elif (model == "multinomialNB"):
    model_obj = NaiveBayes(reviews, "multinomial")

elif (model == "lda"):
    model_obj = TopicModel(reviews)

elif (model == "kNearestNeighbors"):
    model_obj = knn(reviews, target)

else:  # put additional models here.
    print("Argument Error: invalid model specified")
    sys.exit()

model_classified = []  #  classifications stored here
reviews = []  #  resetting reviews list to save memory

#  Reading test data into reviews list
Example #27
##datafile = "../data/weather.nominal.txt"
##pos_class = "play:yes"
##pos_class = "play:no"

##datafile = "haireyescolor.txt"
##pos_class = "Sex:Male"
##pos_class = "Sex:Female"

##datafile = "../data/cmc-full.txt"
##pos_class = "contraceptive-method:none"
##pos_class = "contraceptive-method:long-term"
##pos_class = "contraceptive-method:short-term"

d = Data(datafile)
prnb = NaiveBayes(d)
##prnb = MaxAPost(d)
prnb.train()

pos = 0.0
neg = 0.0

for (v, c_true) in d.test_set:
    if c_true == pos_class:
        pos += 1
    else:
        neg += 1

result_pos = []
result_neg = []
result_dif = []
Example #28
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['is_spam'], test_size=0.2, random_state=191)

    print('Data set:')
    print('{} total'.format(df.shape[0]))
    for t, t_name in zip(targets, target_names):
        print('{} {}'.format(len(df[df['is_spam'] == t]), t_name))

    print('\nTraining set:')
    print('{} total'.format(len(X_train)))
    for t, t_name in zip(targets, target_names):
        print('{} {}'.format(sum([y == t for y in y_train]), t_name))

    print('\nTest set:')
    print('{} total'.format(len(X_test)))
    for t, t_name in zip(targets, target_names):
        print('{} {}'.format(sum([y == t for y in y_test]), t_name))
    print('')

    # Build Classifier
    gvoc_model = NaiveBayes('General Vocabulary', X_train,
                            y_train, targets, target_names)
    gvoc_model.train()

    gvoc_model.evaluate(X_test, y_test, show_top_features=10)

    rvoc_model = NaiveBayes('Reduced Vocabulary', X_train, y_train, targets,
                            target_names, max_features=200)
    rvoc_model.train()

    rvoc_model.evaluate(X_test, y_test, show_top_features=10)
Example #29
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from naivebayes import NaiveBayes


def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


X, y = datasets.make_classification(n_samples=1000,
                                    n_features=10,
                                    n_classes=2,
                                    random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
Example #30
# Imports mirrored from Example #29, which this snippet otherwise assumes.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split

from naivebayes import NaiveBayes


def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)


X, y = datasets.make_blobs(n_samples=1000,
                           n_features=2,
                           centers=3,
                           cluster_std=1.0,
                           center_box=(-10.0, 10.0),
                           shuffle=True,
                           random_state=123,
                           return_centers=False)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.9,
                                                    random_state=1234)

clf = NaiveBayes()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(accuracy(y_test, y_pred))

color_map = {0: 'r', 1: 'k', 2: 'g'}

label_color = [color_map[l] for l in y_pred]
plt.scatter(X_test[:, 0], X_test[:, 1], c=label_color)
plt.show()
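
For readers without the naivebayes module, here is a minimal Gaussian Naive Bayes with the same fit/predict interface that this snippet and Example #29 assume; this is a sketch, not the module's actual implementation.

import numpy as np


class GaussianNB:
    """Minimal Gaussian Naive Bayes sketch with a fit/predict interface."""

    def fit(self, X, y):
        self.classes = np.unique(y)
        # Per-class feature means, (smoothed) variances, and log priors.
        self.mean = np.array([X[y == c].mean(axis=0) for c in self.classes])
        self.var = np.array([X[y == c].var(axis=0) + 1e-9 for c in self.classes])
        self.log_prior = np.log(np.array([np.mean(y == c) for c in self.classes]))

    def predict(self, X):
        # Log Gaussian density, summed over conditionally independent features.
        log_likelihood = -0.5 * (np.log(2 * np.pi * self.var)[None, :, :] +
                                 (X[:, None, :] - self.mean[None, :, :]) ** 2 /
                                 self.var[None, :, :]).sum(axis=2)
        return self.classes[np.argmax(log_likelihood + self.log_prior, axis=1)]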
Example #31
    def __init__(self):
        self.classifier = NaiveBayes()
Example #32
from naivebayes import NaiveBayes

nb = NaiveBayes(3, 3)

dataset = [
    ([0, 0, 1], 1),
    ([0, 1, 0], 0),
    ([0, 1, 1], 1),
    ([1, 0, 0], 0),
    ([1, 1, 0], 0),
    ([1, 1, 1], 2),
    ([1, 0, 1], 2),
    ([0, 1, 1], 1),
    ([0, 1, 1], 1),
    ([0, 1, 1], 1),
    ([0, 1, 1], 1),
    ([0, 1, 1], 2),
    ([0, 0, 1], 1),
    ([1, 0, 1], 2),
    ([1, 1, 0], 0)]

for i, t in dataset:
    nb.update(i, t)

print(nb.class_count)
print(nb.feature_count)

for i, t in dataset:
Example #33
def evaluate_naivebayes():
    nb = NaiveBayes(sys.argv[1], evaluate=True)
    out = nb.evaluate(sys.argv[2])
    process(out, 'Naive Bayes')
Example #34
def stdmean():
    limit = 0.7
    ratio = 0.8
    times = 5
    print("digit")
    traindata, testdata = dataloader_digit()
    sal = []
    mal = []
    pal = []

    for p in range(10, 101, 10):
        al = []
        il = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = Perceptron(traindata.width * traindata.height,
                            traindata.labeldomain)
            pc.train(images, labels, 3, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
    plt.plot(pal, sal, label="digitdata Perceptron std")
    plt.plot(pal, mal, label="digitdata Perceptron mean")

    feature_domains = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):
        al = []
        for i in range(3):
            images, labels = traindata.shuffleout(p)
            nb = NaiveBayes(feature_domains, traindata.labeldomain, 1)
            nb.train(images, labels)
            x = nb.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="digitdata NaiveBayes std")
    plt.plot(pal, mal, label="digitdata NaiveBayes mean")

    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):

        al = []
        il = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                                len(traindata.labeldomain)),
                               traindata.labeldomain)
            pc.train(images, labels, 50, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="digitdata NeuralNetwork std")
    plt.plot(pal, mal, label="digitdata NeuralNetwork mean")

    print("face")
    traindata, testdata = dataloader_face()
    sal = []
    mal = []
    pal = []

    for p in range(10, 101, 10):

        al = []
        il = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = Perceptron(traindata.width * traindata.height,
                            traindata.labeldomain)
            pc.train(images, labels, 3, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
    plt.plot(pal, sal, label="facedata Perceptron std")
    plt.plot(pal, mal, label="facedata Perceptron mean")

    feature_domains = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):
        al = []

        for i in range(3):
            images, labels = traindata.shuffleout(p)
            nb = NaiveBayes(feature_domains, traindata.labeldomain, 1)
            nb.train(images, labels)
            x = nb.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="facedata NaiveBayes std")
    plt.plot(pal, mal, label="facedata NaiveBayes mean")

    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):

        al = []
        il = []

        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                                len(traindata.labeldomain)),
                               traindata.labeldomain)
            pc.train(images, labels, 50, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="facedata NeuralNetwork std")
    plt.plot(pal, mal, label="facedata NeuralNetwork mean")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    plt.ylabel("time(in second)")
    plt.show()
Example #35
def naivebayes(trainf, testf):
    nb = NaiveBayes(trainf)
    nb.classify(testf)
Example #36
    from naivebayes import NaiveBayes
    from data import Data

    print_numbers = False

    datafile = "ds/titanicTr.txt"
    pos_class = "Survived:Yes"
    #pos_class = "Survived:No"

    # datafile = "cmcTr.txt"
    # pos_class = "contraceptive-method:none"

    d = Data(datafile)

    prnb = NaiveBayes(d)
    prnb.train()

    r = Roc(prnb, pos_class)

    r.do_curve()

    print "Predicting", pos_class, "for data file", datafile,
    print "with", int(r.curve[2]), "positive instances and", int(
        r.curve[3]), "negative instances"

    if print_numbers:
        prnb.show()

        print "Scores for predicting", pos_class, ":"
        for e in sorted(r.preds):
Example #37
from data import Data
from naivebayes import NaiveBayes

filename = "datasets/weatherNominal.td"
## filename = "datasets/titanic.td"
## filename = "datasets/cmc.td"

d = Data(filename)
d.report()

pr = NaiveBayes(d)
pr.train()
pr.show()

for (v, c_true) in d.test_set:
    c_pred = pr.predict(v)[0]
    print(v, ":")
    print("   ", c_pred, "( true class:", c_true, ")")

##    print(pr.predict(("Class:1st","Sex:Female","Age:Child")))

##    print(pr.predict(("Class:Crew","Sex:Female","Age:Child")))
Example #38
    def __init__(self):
        self.classifier = NaiveBayes()
Example #39
def test_naivebayes_argmax_all():

    traindata, testdata = dataloader_digit()
    feature_domains = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domains, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="digitdata order")
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domains, traindata.labeldomain, 1)
        images, labels = traindata.shuffleout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="digitdata random")

    traindata, testdata = dataloader_face()
    feature_domains = [[0, 1]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domains, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="facedata order")
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domains, traindata.labeldomain, 1)
        images, labels = traindata.shuffleout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="facedata random")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    plt.ylabel("accuracy")
    plt.show()
Example #40
def spamHamtoyExample() -> None:
    '''
    Trains a naive Bayes classifier on a folder of spam/ham emails.
    Checks the quality of the classifier by using it to predict the emails in the 'test' folder.
    Different feature counts (1-50) are tried to find how many features give the best classification score.
    Plots the result - x-axis = number of features, y-axis = classification accuracy.
    '''
    filedir = '../data/emails/'
    naivebay = NaiveBayes()
    naivebay.train(os.path.join(filedir, 'train/'))

    numOfItemsToPrint = 4
    naivebay.printMostPopularHamWords(numOfItemsToPrint)
    naivebay.printMostPopularSpamWords(numOfItemsToPrint)
    naivebay.printMostindicativeHamWords(numOfItemsToPrint)
    naivebay.printMostindicativeSpamWords(numOfItemsToPrint)

    print('Model logPrior: {}'.format(naivebay.logPrior))
    features = [1, 2, 5, 10, 20, 30, 40, 50]
    accuracy = []
    for i in features:
        acc = naivebay.classifyAndEvaluateAllInFolder(
            os.path.join(filedir, 'test/'), i)
        accuracy.append(acc)
        print(i, "features, classification score:", acc)
    plt.figure("Naive results: #features vs classification error rate")
    plt.plot(features, accuracy)
    plt.grid(True)
    plt.xlabel('Number of Features')
    plt.ylabel('Classification Score')
    plt.show()
Example #41
def timeana():
    import time
    limit = 0.7
    ratio = 1
    times = 200
    print("digit")
    traindata, testdata = dataloader_digit()
    fal = []
    pal = []

    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = Perceptron(traindata.width * traindata.height,
                        traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            if a > limit:
                break
        end = time.time()  # set end even if the accuracy limit was never reached
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="digitdata Perceptron")

    feature_domains = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(20, 101, 10):
        start = time.time()
        nb = NaiveBayes(feature_domains, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        end = time.time()
        fal.append(end - start)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="digitdata NaiveBayes")

    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                            len(traindata.labeldomain)), traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            if a > limit:
                break
        end = time.time()  # set end even if the accuracy limit was never reached
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="digitdata NeuralNetwork")

    print("face")
    traindata, testdata = dataloader_face()
    fal = []
    pal = []

    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = Perceptron(traindata.width * traindata.height,
                        traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            if a > limit:
                break
        end = time.time()  # set end even if the accuracy limit was never reached
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="facedata Perceptron")

    feature_domains = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(20, 101, 10):
        start = time.time()
        nb = NaiveBayes(feature_domains, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        end = time.time()
        fal.append(end - start)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="facedata NaiveBayes")

    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                            len(traindata.labeldomain)), traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            if a > limit:
                break
        end = time.time()  # set end even if the accuracy limit was never reached
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="facedata NeuralNetwork")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    plt.ylabel("time(in second)")
    plt.show()
Example #42
from naivebayes import NaiveBayes

from TwitterAPI import TwitterAPI

# Placeholders; supply your own Twitter API credentials.
consumer_key = "YOUR_CONSUMER_KEY"
consumer_secret = "YOUR_CONSUMER_SECRET"
access_token_key = "YOUR_ACCESS_TOKEN_KEY"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"
api = TwitterAPI(consumer_key, consumer_secret, access_token_key, access_token_secret)

training_set = []

r = api.request('statuses/filter', {'locations':'-74,40,-73,41'})

nb = NaiveBayes()
nb.load()

cin = ''
for item in r:
    print(item['text'])
    cin = input('Basic? Y/n/quit: ')
    if cin == 'n':
        training_set.append(('non-basic', item['text']))
    elif cin == 'quit':
        break
    else:
        training_set.append(('basic', item['text']))

nb.train(training_set)
nb.save()
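
A later session would presumably reload the saved model and score fresh tweets; a sketch under the assumption that classify takes a tweet's text and returns the 'basic'/'non-basic' label (the method name is an assumption, not shown in this example).

nb = NaiveBayes()
nb.load()
for item in api.request('statuses/filter', {'locations': '-74,40,-73,41'}):
    print(nb.classify(item['text']), '->', item['text'])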