Example #1
def cross_validate(folds, method):
    if folds < 2:
        print('Must have at least 2 folds; evaluating 2-fold cross validation')
        folds = 2
    test_size = 100 / folds  # assumes 100 songs per class
    training_size = 100 - test_size
    songs_by_class = split_by_class()
    sentiment_accuracy_sum = 0.0
    emotion_accuracy_sum = 0.0
    for f in range(folds):
        lo = int(test_size * f)
        hi = int(test_size + test_size * f)
        test_set = (songs_by_class['+'][lo:hi]
                    + songs_by_class['0'][lo:hi]
                    + songs_by_class['-'][lo:hi])
        training_set = (songs_by_class['+'][hi:] + songs_by_class['+'][:lo]
                        + songs_by_class['0'][hi:] + songs_by_class['0'][:lo]
                        + songs_by_class['-'][hi:] + songs_by_class['-'][:lo])
        if method == 'nb':
            nb = NaiveBayes()
            nb.train_model(training_set)
            sentiment_accuracy, emotion_accuracy = nb.evaluate_model(test_set, len(training_set))
            emotion_accuracy_sum += emotion_accuracy
            sentiment_accuracy_sum += sentiment_accuracy
        elif method == 'sa':
            sa = SimpleAveraging()
            avgs = sa.train(training_set)
            sentiment_accuracy, emotion_accuracy = sa.evaluate(test_set, avgs)
            emotion_accuracy_sum += emotion_accuracy
            sentiment_accuracy_sum += sentiment_accuracy
        # elif method == 'pool':
        #     pool = AffectPool(NaiveBayes(), SimpleAveraging())
        #     pool.simple_train(training_set)
        elif method == 'r':
            nb = NaiveBayes()
            nb.train_model(test_set + training_set)
    print "EMOTION ACCURACY ", emotion_accuracy_sum / folds, " SENTIMENT ACCURACY: ", sentiment_accuracy_sum / folds
Example #2
def computeNaiveBayes(args, dict_algorithms):
    if (args.debug):
        print("Running naive bayes...", end='')
    model = NaiveBayes(args)
    dict_algorithms["naive_bayes"] = model.compute()
    if (args.debug):
        print("ok!")
Example #3
def evaluate(trainfile, testfile):
    # Load the training data
    trainData = []
    fp = codecs.open(trainfile, "r", "utf-8")
    for line in fp:
        line = line.rstrip()
        temp = line.split()
        trainData.append(temp)
    fp.close()
    
    # Train the naive Bayes classifier
    nb = NaiveBayes()
    nb.train(trainData)
    print(nb)
    
    # Evaluate on the test data
    hit = 0
    numTest = 0
    fp = codecs.open(testfile, "r", "utf-8")
    for line in fp:
        line = line.rstrip()
        temp = line.split()
        correct = temp[0]    # correct category
        words = temp[1:]     # document: a set of words
        predict = nb.classify(words)  # predict the category with naive Bayes
        if correct == predict:
            hit += 1  # a hit when the prediction matches the correct label
        numTest += 1
    print "accuracy:", float(hit) / float(numTest)
    fp.close()
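
For reference, every line in these files is a category followed by the words of one document; a minimal way to drive evaluate() (file names and contents are hypothetical, and the NaiveBayes class used by the snippet is assumed to be available):

# Hypothetical input format: "<category> <word1> <word2> ...", one document per line.
with codecs.open('train.txt', 'w', 'utf-8') as fp:
    fp.write('sports ball game team\npolitics vote election debate\n')
with codecs.open('test.txt', 'w', 'utf-8') as fp:
    fp.write('sports ball team\n')
evaluate('train.txt', 'test.txt')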
Example #4
    def _populate(self, tweets):
        """
        :param tweets: A python dictionary containing trends as keys and list of tweets as
        values against each trend.
        :return: None

        This is a private method used by the constructor to populate the inverted index object
        """
        for trendName in tweets:
            self.trends.append(trendName)
            self.totalTweets += len(tweets[trendName])

            # classify trend
            tweetsDoc = " ".join([tweet.text for tweet in tweets[trendName]])
            model = NaiveBayes()
            model.loadModelFromDB()
            self.categories.append(model.classify(tweetsDoc))

            for tweet in tweets[trendName]:
                if tweet.user.screen_name not in self.twitterHandles:
                    self.twitterHandles.append(tweet.user.screen_name)
                    posts = [(self.trends.index(trendName), tweet)]
                    self.indexLists.append(posts)
                else:
                    posts = self.indexLists[self.twitterHandles.index(
                        tweet.user.screen_name)]
                    posts.append((self.trends.index(trendName), tweet))
        self.logger.debug(
            'Created and populated Inverted Index: Trends-{}, Tweets-{}'.
            format(len(self.trends), self.totalTweets))
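
As an illustration of the shape this produces, here is a minimal, hypothetical instance of the parallel structures populated above (tweet objects replaced by plain strings):

trends = ['#trend1', '#trend2']
categories = ['sports', 'politics']          # one predicted label per trend
twitterHandles = ['alice', 'bob']
indexLists = [
    [(0, 'tweet a1'), (1, 'tweet a2')],      # alice's posts as (trend index, tweet)
    [(0, 'tweet b1')],                       # bob's posts
]
# indexLists[i] holds the posts of twitterHandles[i].
assert len(indexLists) == len(twitterHandles)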
Example #5
def spamHamtoyExample() -> None:
    '''
    Trains a naive Bayes classifier on a folder of spam/ham emails, then checks its
    quality by predicting the emails in the 'test' folder. Several feature counts
    (1-50) are tried to see how many features give the best classification score,
    and the result is plotted (x-axis: number of features, y-axis: accuracy).
    '''
    filedir = '../data/emails/'
    naivebay = NaiveBayes()
    naivebay.train(os.path.join(filedir, 'train/'))

    numOfItemsToPrint = 4
    naivebay.printMostPopularHamWords(numOfItemsToPrint)
    naivebay.printMostPopularSpamWords(numOfItemsToPrint)
    naivebay.printMostindicativeHamWords(numOfItemsToPrint)
    naivebay.printMostindicativeSpamWords(numOfItemsToPrint)

    print('Model logPrior: {}'.format(naivebay.logPrior))
    features = [1, 2, 5, 10, 20, 30, 40, 50]
    accuracy = []
    for i in features:
        acc = naivebay.classifyAndEvaluateAllInFolder(
            os.path.join(filedir, 'test/'), i)
        accuracy.append(acc)
        print(i, "features, classification score:", acc)
    plt.figure("Naive results: #features vs classification error rate")
    plt.plot(features, accuracy)
    plt.grid(True)
    plt.xlabel('Number of Features')
    plt.ylabel('Classification Score')
    plt.show()
Example #6
def crossValidation(data, N=num, randomize=False):
    # `num` is assumed to be a module-level constant in the source project.

    if randomize:
        from random import shuffle
        shuffle(data)

    # Cross Validation
    accuracyList = []
    for n in range(N):
        # split train and test data
        trainData = [d for i, d in enumerate(data) if i % N != n]
        testData = [d for i, d in enumerate(data) if i % N == n]

        # train data
        nb = NaiveBayes()
        nb.train(trainData)

        # accuracy of test data
        hit = 0
        numTest = 0
        for d in testData:
            correct = d[0]
            words = d[1:]
            predict = nb.classify(words)
            if correct == predict:
                hit += 1
            numTest += 1
        accuracy = float(hit) / float(numTest)
        accuracyList.append(accuracy)

    average = sum(accuracyList) / float(N)
    return round(average, 4)
Example #7
def dev_train():
    docs = build_doc_set('../papers')
    driver = Processor()
    for d in docs:
        driver.process_document(d)
    driver.clf.set_classifier(NaiveBayes())
    driver.clf.train(docs)
    driver.save_classifier('saved_classifier-367-1')
Example #8
def dev_train_test():
    """Train and test a new classifier on a directory of .txt documents."""
    docs = build_doc_set('../papers')
    print('Processing docset with %d docs...' % len(docs))
    driver = Processor()
    for d in docs:
        driver.process_document(d)
    driver.clf.set_classifier(NaiveBayes())
    driver.clf.train_and_test(docs, split=.07)
Example #9
def test_naivebayes(traindata, testdata):
    # raw pixel features
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        print(a)
Example #10
 def test_naivebayes_labeling(self):
     naivebayes = NaiveBayes()
     corpus = Serializer.load_csv('../resource/corpus.tsv')
     data = []
     target = []
     for row in corpus:
         data.append(str(row[0]))
         t = int(row[1])
         if t > 5:
             raise Exception(t)
         target.append(t)
     np.array(target, dtype=np.uint8, ndmin=1)
Example #11
    def test_naivebayes_compare(self):
        basepath = '../resource/'
        naivebayes = NaiveBayes()
        json_data = Serializer.load_json(os.path.join(basepath, 'ocr.json'))
        naivebayes.human_labels = json_data['translate']['country']
        x_list = ['ネツァワル王国', 'カセドリア連合王国', 'ゲブランド帝国', 'ホルデイン王国', 'エルソード王国']

        print(json_data['translate']['country'])
        out = naivebayes.predict_all(x_list)
        for i, y in enumerate(out):
            if x_list[i] != y:
                raise Exception('compare x:{0},predict:{1}'.format(
                    x_list[i], y))
Example #12
def main(flag=True):
    if flag:
        start = timer()
        # Load the labels for the email data
        label_df = pd.read_csv("./input/trec06c/full/index_bak",
                               sep=' ..',
                               names=['label', 'filename'])

        for key in label_df['label'].unique():
            print(key, len(label_df[label_df['label'] == key]))

        train, valid = train_test_split(label_df,
                                        test_size=0.2,
                                        random_state=2018)

        normFilelen = train[train['label'] == 'ham'].shape[0]
        spamFilelen = train[train['label'] == 'spam'].shape[0]

        model = NaiveBayes(normFilelen, spamFilelen)
        # model.getStopWords()

        for index, row in tqdm(train.iterrows(), total=train.shape[0]):
            # Record the words appearing in each email in wordsList
            model.get_word_list('./input/trec06c' + row['filename'],
                                row['label'])
        print('Finished learning the training set, elapsed %.2fs' % (timer() - start))

        for index, row in tqdm(valid.iterrows(), total=valid.shape[0]):
            if 'test' in model.wordDict:
                model.wordDict['test'].clear()
            model.get_word_list('./input/trec06c' + row['filename'], 'test')
            wordProbList = model.getTestWords(model.wordDict['test'])
            # Compute the Bayesian probability from the 15 words extracted from each email
            trash_p = model.calBayes(wordProbList)
            if row['label'] == 'spam':
                if trash_p > 0.9:
                    model.validResult['TN'] += 1  # trash
                else:
                    model.validResult['FN'] += 1  # normal
            else:
                if trash_p > 0.9:
                    model.validResult['FP'] += 1  # trash
                else:
                    model.validResult['TP'] += 1  # normal
        model.calMetric()
        print('Finished processing the validation set, elapsed %.2fs' % (timer() - start))
        pickle.dump(model, open('bayes_model.obj', 'wb'))
    else:
        model = pickle.load(open('bayes_model.obj', 'rb'))
        print("模型加载成功!")
    return model
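
calMetric() itself is not shown; here is a plausible sketch of metrics derivable from the validResult counters, under the snippet's convention that normal mail is the positive class (TP: ham kept, TN: spam caught):

# Hypothetical sketch of what calMetric() might compute from the counters above.
def cal_metrics(r):
    total = r['TP'] + r['TN'] + r['FP'] + r['FN']
    accuracy = (r['TP'] + r['TN']) / total
    spam_recall = r['TN'] / (r['TN'] + r['FN'])   # share of spam actually caught
    ham_recall = r['TP'] / (r['TP'] + r['FP'])    # share of normal mail kept
    return accuracy, spam_recall, ham_recall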
Example #13
def train(instances):
    print('starting training')
    p = None
    if args.algorithm == 'lambda_means':
        p = LambdaMeans(args.cluster_lambda, max_max_index,
                        args.clustering_training_iterations)
        p.train(instances)
    elif args.algorithm == 'nb_clustering':
        p = NaiveBayes(args.num_clusters, max_max_index,
                       args.clustering_training_iterations)
        p.train(instances)

    print('ending training')
    return p
Example #14
 def train(self, train_set):
     """Teaches the classifier with labeled data instances."""
     for d in train_set:
         self.corpus.add_doc(d)
     print('Training on %d documents...\n' % len(train_set))
     if isinstance(self.classifier, NaiveBayes):
         self.classifier.train(self.corpus)
         for c in self.corpus.get_classes():
             if len(c.get_classes()) > 1:
                 subclassifier = NaiveBayes()
                 subclassifier.train(c)
                 self.subclassifiers[c.get_label()] = subclassifier
     else:   # non-NaiveBayes (nltk/sklearn-style) classifiers
         labeled_feature_set = [(d.get_features(), d.get_labels()[0]) for d in train_set]
         self.classifier.train(labeled_feature_set)
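
The matching prediction step is not part of the snippet; a purely hypothetical sketch of how the subclassifiers dictionary built above might be consulted, borrowing the classify() interface seen in Examples #3 and #9:

 def classify(self, doc):
     # Hypothetical: top-level prediction, refined by a per-class subclassifier when present.
     label = self.classifier.classify(doc)
     if label in self.subclassifiers:
         label = self.subclassifiers[label].classify(doc)
     return label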
Example #15
    def main():
        nb = NaiveBayes()
        nb.load_data_training()
        nb.mulai_training()

        # TODO: [STEP-10] Try making a prediction!
        # If the weather is 'Hujan' (rainy), the temperature is 'Dingin' (cold), the laziness
        # level is 'Tinggi' (high), and the student woke up late, do they attend class or skip?

        hasil_prediksi = nb.prediksi(nilai_cuaca='Hujan',
                                     nilai_suhu='Dingin',
                                     nilai_tingkat_malas='Tinggi',
                                     nilai_bangun_siang='Ya')
        print('=====================================')

        print('Final prediction = {}, with a probability of {}%'.format(
            hasil_prediksi['hasil'], (hasil_prediksi['peluang'] * 100)))
Example #16
def trainTrendClassifier():
    """
    :return: None

    This function instantiates a model of the NaiveBayes class and trains it on the
    categorized trends data. The trained model is stored in the database for future
    classification purposes.
    """
    logger.debug("trainTrendsClassifier()")
    trainingFolder = config['training']['trends']
    trainingDocs, trainingLabels = getData(trainingFolder)
    logger.debug("documents: " + str(len(trainingDocs)) + ", labels: " +
                 str(len(trainingLabels)))

    model = NaiveBayes()

    model.train(trainingDocs, trainingLabels, stopWordsFlag=True, stem=True)
    model.saveToDB()
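
The persisted model can later be restored the same way Example #4 above does it; a minimal sketch:

# Restoring the saved model elsewhere (mirrors the usage in Example #4):
model = NaiveBayes()
model.loadModelFromDB()
print(model.classify('text of some trending topic'))  # prints the predicted category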
Example #17
def stdmean():
    limit = 0.7
    ratio = 0.8
    times = 5
    print("digit")
    traindata, testdata = dataloader_digit()
    sal = []
    mal = []
    pal = []

    for p in range(10, 101, 10):
        al = []
        il = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = Perceptron(traindata.width * traindata.height,
                            traindata.labeldomain)
            pc.train(images, labels, 3, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
    plt.plot(pal, sal, label="digitdata Perceptron std")
    plt.plot(pal, mal, label="digitdata Perceptron mean")

    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):
        al = []
        for i in range(3):
            images, labels = traindata.shuffleout(p)
            nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
            nb.train(images, labels)
            x = nb.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="digitdata NaiveBayes std")
    plt.plot(pal, mal, label="digitdata NaiveBayes mean")

    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):

        al = []
        il = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                                len(traindata.labeldomain)),
                               traindata.labeldomain)
            pc.train(images, labels, 50, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="digitdata NeuralNetwork std")
    plt.plot(pal, mal, label="digitdata NeuralNetwork mean")

    print("face")
    traindata, testdata = dataloader_face()
    sal = []
    mal = []
    pal = []

    for p in range(10, 101, 10):

        al = []
        il = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = Perceptron(traindata.width * traindata.height,
                            traindata.labeldomain)
            pc.train(images, labels, 3, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
    plt.plot(pal, sal, label="facedata Perceptron std")
    plt.plot(pal, mal, label="facedata Perceptron mean")

    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):
        al = []

        for i in range(3):
            images, labels = traindata.shuffleout(p)
            nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
            nb.train(images, labels)
            x = nb.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="facedata NaiveBayes std")
    plt.plot(pal, mal, label="facedata NaiveBayes mean")

    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):

        al = []
        il = []

        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                                len(traindata.labeldomain)),
                               traindata.labeldomain)
            pc.train(images, labels, 50, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="facedata NeuralNetwork std")
    plt.plot(pal, mal, label="facedata NeuralNetwork mean")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    plt.ylabel("time(in second)")
    plt.show()
Example #18
from datetime import datetime
from flask import Flask, render_template, request
from search import Search
from naivebayes import NaiveBayes
from imagesearch import ImageSearch
from imagecaption import ImageCaption
from sklearn.model_selection import train_test_split
from zipfile import ZipFile

with ZipFile('pre_processed_data.zip', 'r') as zipObj:
    zipObj.extractall()

search = Search()
search.init()

naive_bayes = NaiveBayes()
naive_bayes.init()

imagesearch = ImageSearch()
imagesearch.init()

image_caption = ImageCaption()
image_caption.init()

app = Flask(__name__)


@app.route("/")
def hello():
    return '<a href="/search">Search</a><br><a href="/classify">Classify</a><br><a href="/image_search">Image Search</a>'
Example #19
def main(argv):
    import fileinput
    import getopt

    def usage():
        print(f'usage: {argv[0]} '
              '[-d] [-o path] [-i path] [-r ratio] [-s supports] [-v vars] '
              'feats.db [items ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'do:i:r:s:v:')
    except getopt.GetoptError:
        return usage()
    debug = 0
    outpath = None
    inpath = None
    ratio = 0.5
    maxsupports = 3
    types = {}
    C = 0.4  # distance weight should be (1.5**d) ~= exp(0.4*d)
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-o': outpath = v
        elif k == '-i': inpath = v
        elif k == '-r': ratio = float(v)
        elif k == '-s': maxsupports = int(v)
        elif k == '-v': types = getvars(v)
    assert inpath is None or outpath is None
    if outpath is not None and os.path.exists(outpath):
        print('Already exists: %r' % outpath)
        return 1
    if not args: return usage()
    dbpath = args.pop(0)
    db = FeatDB(dbpath)
    nallitems = len(db.get_items())
    defaultnames = getdefaultnames(types)
    items = db.get_items()
    if args:
        items = [(tid, item) for (tid, item) in items if item in args]

    def learn(tid, item, fids):
        name = stripid(item)
        words = splitwords(name)
        (count, _) = fids[0]
        feats = {feat: fc for (feat, (fc, _)) in fids.items()}
        for w in words:
            nb.adddict(w, count, feats)
        return True

    def predict(tid, item, fids):
        name = stripid(item)
        words = splitwords(name)
        (count, _) = fids[0]
        feats = {fid: fc for (fid, (fc, _)) in fids.items() if fid != 0}
        # Use only prominent features that appear at least as often as the threshold.
        threshold = int(max(feats.values()) * ratio)
        f2 = [feat for (feat, fc) in feats.items() if threshold <= fc]
        for w in words:
            nb.removedict(w, count, feats)
        cands = nb.getkeyfeats(f2)[:len(words)]
        for w in words:
            nb.adddict(w, count, feats)
        if not cands: return False
        cwords = [w for (_, w, _) in cands]
        topword = cwords[0]
        if topword in words: return True
        print('+ITEM', json.dumps(item))
        print('+WORDS', json.dumps(words))
        print('+CANDS', json.dumps(cwords))
        if item in defaultnames:
            print('+DEFAULT', json.dumps(defaultnames[item]))
        fids0 = db.get_feats(tid, source=True)
        srcs0 = {0: [], 1: [], -1: []}
        for (fid, (_, srcs)) in fids0.items():
            if fid == 0:
                d = 0
            else:
                d = db.get_feat(fid)[0]
            if d in srcs0:
                srcs0[d].extend(srcs)
        print(
            '+SOURCE',
            json.dumps([(d, list(set(srcs))) for (d, srcs) in srcs0.items()]))
        supports = []
        for (_, w, a) in cands:
            # Find top N features for each word.
            fs = []
            for (fid, c) in a[1:]:
                feat = db.get_feat(fid)
                assert feat is not None
                # A rarer feature overall means stronger indication.
                df = math.log(nallitems / db.get_numfeatitems(fid))
                # A more prominent feature for this category means stronger indication.
                ff = c / nb.fcount[fid][None]
                # Discount a "distant" feature from the subject.
                ds = math.exp(-C * abs(feat[0]))
                fs.append((ds * df * ff, fid, feat))
            fs = sorted(fs, reverse=True)[:maxsupports]
            score = sum(s for (s, _, _) in fs)
            # Find the variables that contains the same feature.
            ss = []
            for (_, fid, _) in fs:
                found = None
                (_, srcs0a) = fids0[0]
                (_, srcs0b) = fids0[fid]
                tids = db.get_featitems(fid)
                for tid1 in tids.keys():
                    if tid1 == tid: continue
                    item1 = db.get_item(tid1)
                    name1 = stripid(item1)
                    if w not in splitwords(name1): continue
                    fids1 = db.get_feats(tid1, source=True)
                    (_, srcs1a) = fids1[0]
                    (_, srcs1b) = fids1[fid]
                    found = (srcs0a + srcs0b, item1, srcs1a + srcs1b)
                    break
                ss.append(found)
            supports.append((w, score, list(zip(fs, ss))))
        print('+SCORE', json.dumps(sum(score for (_, score, _) in supports)))
        print('+SUPPORTS', json.dumps(supports))
        print()
        return False

    nb = NaiveBayes()
    proc = learn
    if inpath is not None:
        print(f'Importing model: {inpath!r}', file=sys.stderr)
        with open(inpath, 'rb') as fp:
            nb.load(fp)
            proc = predict

    n = m = 0
    for (tid, item) in items:
        fids = db.get_feats(tid)
        n += 1
        if proc(tid, item, fids):
            m += 1
        sys.stderr.write('.')
        sys.stderr.flush()
    print(f'\nProcessed: {m}/{n}', file=sys.stderr)

    if outpath is not None:
        print(f'Exporting model: {outpath!r}', file=sys.stderr)
        with open(outpath, 'wb') as fp:
            nb.save(fp)

    if inpath is None and outpath is None:
        for (tid, item) in items:
            fids = db.get_feats(tid)
            predict(tid, item, fids)

    return 0
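
main(argv) has no entry point in the snippet; the conventional one, with hypothetical invocations in the comments:

if __name__ == '__main__':
    sys.exit(main(sys.argv))

# Hypothetical invocations (script and path names assumed):
#   python predictname.py -o model.nb feats.db          # learn names and export the model
#   python predictname.py -i model.nb feats.db Foo.bar  # import the model and predict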
        with open("winter-" + classifier + ".json") as json_file:
            for line in json_file:
                json_obj = json.loads(line)
                reviews += [(classifier, json_obj)]

#  Creating model objects
model = args.model
if (model == "baseline"):
    model_obj = BaseLine(reviews, categories)

elif (model == "logreg"):
    model_obj = LogReg(reviews)

elif (model == "multinomialNB"):
    model_obj = NaiveBayes(reviews, "multinomial")

elif (model == "lda"):
    model_obj = TopicModel(reviews)

elif (model == "kNearestNeighbors"):
    model_obj = knn(reviews, target)

else:  # put additional models here.
    print("Argument Error: invalid model specified")
    sys.exit()

model_classified = []  #  classifications stored here
reviews = []  #  resetting reviews list to save memory

#  Reading test data into reviews list
Example #21
##datafile = "../data/weather.nominal.txt"
##pos_class = "play:yes"
##pos_class = "play:no"

##datafile = "haireyescolor.txt"
##pos_class = "Sex:Male"
##pos_class = "Sex:Female"

##datafile = "../data/cmc-full.txt"
##pos_class = "contraceptive-method:none"
##pos_class = "contraceptive-method:long-term"
##pos_class = "contraceptive-method:short-term"

## Uncomment one datafile / pos_class pair above before running.
d = Data(datafile)
prnb = NaiveBayes(d)
##prnb = MaxAPost(d)
prnb.train()

pos = 0.0
neg = 0.0

for (v, c_true) in d.test_set:
    if c_true == pos_class:
        pos += 1
    else:
        neg += 1

result_pos = []
result_neg = []
result_dif = []
Example #22
    # First line restored from context: the names below imply a train_test_split assignment.
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['is_spam'], test_size=0.2, random_state=191)

    print('Data set:')
    print('{} total'.format(df.shape[0]))
    for t, t_name in zip(targets, target_names):
        print('{} {}'.format(len(df[df['is_spam'] == t]), t_name))

    print('\nTraining set:')
    print('{} total'.format(len(X_train)))
    for t, t_name in zip(targets, target_names):
        print('{} {}'.format(sum([y == t for y in y_train]), t_name))

    print('\nTest set:')
    print('{} total'.format(len(X_test)))
    for t, t_name in zip(targets, target_names):
        print('{} {}'.format(sum([y == t for y in y_test]), t_name))
    print('')

    # Build Classifier
    gvoc_model = NaiveBayes('General Vocabulary', X_train,
                            y_train, targets, target_names)
    gvoc_model.train()

    gvoc_model.evaluate(X_test, y_test, show_top_features=10)

    rvoc_model = NaiveBayes('Reduced Vocabulary', X_train, y_train, targets,
                            target_names, max_features=200)
    rvoc_model.train()

    rvoc_model.evaluate(X_test, y_test, show_top_features=10)
Example #23
# Imports assumed from context (this snippet matches the setup of Example #29):
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split

from naivebayes import NaiveBayes


def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)


X, y = datasets.make_blobs(n_samples=1000,
                           n_features=2,
                           centers=3,
                           cluster_std=1.0,
                           center_box=(-10.0, 10.0),
                           shuffle=True,
                           random_state=123,
                           return_centers=False)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.9,
                                                    random_state=1234)

clf = NaiveBayes()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(accuracy(y_test, y_pred))

color_map = {0: 'r', 1: 'k', 2: 'g'}

label_color = [color_map[l] for l in y_pred]
plt.scatter(X_test[:, 0], X_test[:, 1], c=label_color)
plt.show()
Example #24
def timeana():
    import time
    limit = 0.7
    ratio = 1
    times = 200
    print("digit")
    traindata, testdata = dataloader_digit()
    fal = []
    pal = []

    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = Perceptron(traindata.width * traindata.height,
                        traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            if a > limit:
                break
        end = time.time()  # taken after the loop so it's defined even if `limit` is never reached
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="digitdata Perceptron")

    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(20, 101, 10):
        start = time.time()
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        end = time.time()
        fal.append(end - start)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="digitdata NaiveBayes")

    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                            len(traindata.labeldomain)), traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            if a > limit:
                break
        end = time.time()  # defined even if `limit` is never reached
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="digitdata NeuralNetwork")

    print("face")
    traindata, testdata = dataloader_face()
    fal = []
    pal = []

    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = Perceptron(traindata.width * traindata.height,
                        traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            if a > limit:
                break
        end = time.time()  # defined even if `limit` is never reached
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="facedata Perceptron")

    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(20, 101, 10):
        start = time.time()
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        end = time.time()
        fal.append(end - start)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="facedata NaiveBayes")

    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                            len(traindata.labeldomain)), traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            if a > limit:
                break
        end = time.time()  # defined even if `limit` is never reached
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="facedata NeuralNetwork")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    plt.ylabel("time(in second)")
    plt.show()
Example #25
from data import Data
from naivebayes import NaiveBayes

filename = "datasets/weatherNominal.td"
## filename = "datasets/titanic.td"
## filename = "datasets/cmc.td"

d = Data(filename)
d.report()

pr = NaiveBayes(d)
pr.train()
pr.show()

for (v, c_true) in d.test_set:
    c_pred = pr.predict(v)[0]
    print(v, ":")
    print("   ", c_pred, "( true class:", c_true, ")")

##    print(pr.predict(("Class:1st","Sex:Female","Age:Child")))

##    print(pr.predict(("Class:Crew","Sex:Female","Age:Child")))
Example #26
def test_naivebayes_argmax_all():

    traindata, testdata = dataloader_digit()
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="digitdata order")
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.shuffleout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="digitdata random")

    traindata, testdata = dataloader_face()
    feature_domians = [[0, 1]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="facedata order")
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.shuffleout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="facedata random")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    plt.ylabel("accuracy")
    plt.show()
Example #27
#  Creating model objects
model = args.model
if (model == "baseline"):
    model_obj = BaseLine(reviews, {
        "winter": 0,
        "spring": 0,
        "summer": 0,
        "fall": 0
    })
elif (model == "kNearestNeighbors"):
    model_obj = knn(reviews, target)
elif (model == "logreg"):
    model_obj = LogReg(reviews)

elif (model == "multinomialNB"):
    model_obj = NaiveBayes(reviews, "multinomial")

elif (model == "gaussianNB"):
    model_obj = NaiveBayes(reviews, "gaussian")

elif (model == "lda"):
    model_obj = TopicModel(reviews)

else:  # put additional models here.
    print("Argument Error: invalid model specified")
    sys.exit()

model_classified = []  #  classifications stored here
reviews = []  #  resetting reviews list to save memory

#  Reading test data into reviews list
Example #28
 def __init__(self):
     self.classifier = NaiveBayes()
Example #29
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from naivebayes import NaiveBayes


def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


X, y = datasets.make_classification(n_samples=1000,
                                    n_features=10,
                                    n_classes=2,
                                    random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
Example #30
def naivebayes(trainf, testf):
    nb = NaiveBayes(trainf)
    nb.classify(testf)