Example #1
def get_lambda(parts, alpha):
    # Sweep lambda_legit over 10^0 .. 10^100 and report, for each value,
    # how many legitimate messages were misclassified as spam.
    for i in range(0, 101, 5):
        lambda_legit = 10**i
        sum_accuracy_score = 0
        cnt = 0
        for test in parts:
            d_train = []
            predict = []
            answer = []

            # Train on every part except the held-out one (cross-validation).
            for part in parts:
                if part != test:
                    d_train = d_train + part

            bayes = train(d_train)

            for answer_class, message in test:
                predict_class, _ = classify(bayes, message, 1, lambda_legit,
                                            alpha)
                predict.append(int(predict_class == SPAM))
                answer.append(int(answer_class == SPAM))

                # Count legitimate messages misclassified as spam.
                if predict_class == SPAM and answer_class == LEGIT:
                    cnt += 1

            sum_accuracy_score += accuracy_score(predict, answer)

        print("cnt:", cnt, "lambda:", lambda_legit,
              "accuracy:", sum_accuracy_score / len(parts))
Example #2
def test():
    """
    A probability near 1 means the document is classified as abusive (bad).
    """
    listpost, listclass = bayes.loaddataset()
    myvocablist = bayes.createlist(listpost)
    tmatrix = list()
    for doc in listpost:
        vec = bayes.word2vec(myvocablist, doc)
        tmatrix.append(vec)
    p0, p1, pa = bayes.train(tmatrix, listclass)
    testdoc1 = ['love', 'my', 'dalmation']
    testvec1 = bayes.word2vec(myvocablist, testdoc1)
    print(testdoc1, 'classified as:', bayes.classify(testvec1, p0, p1, pa))
    testdoc2 = ['stupid', 'love']
    testvec2 = bayes.word2vec(myvocablist, testdoc2)
    print(testdoc2, 'classified as:', bayes.classify(testvec2, p0, p1, pa))
Example #3
def final_result(text):

	result = classify(text)

	if result < 0.35:
		return "Positive"
	elif result>0.65:
		return "Negative"
	else:
		return "Neutral"
Example #4
def train(pkgDict, settings=None):
    # settings must supply a TEST_STRATEGY entry; the default of None
    # only exists to match the original signature.
    classCorrect, count = 0, 0
    # Fit a mean vector and a covariance matrix for every class.
    for key in pkgDict.keys():
        matrix = [np.asarray(item) for item in pkgDict[key][TRAIN]]
        pkgDict[key][COVARIANCE] = b.getCovarianceMatrix(matrix, settings[TEST_STRATEGY])
        pkgDict[key][MEAN] = np.mean(pkgDict[key][TRAIN], axis=0)
    # Classify every held-out sample and track the hit rate.
    for key in pkgDict.keys():
        for sample in pkgDict[key][TEST]:
            classifiedClass = b.classify(pkgDict, sample, settings[TEST_STRATEGY])
            if key == classifiedClass:
                classCorrect += 1
            count += 1
    return classCorrect / count
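
For context, a rough sketch of what a covariance-based classify step could look like under a Gaussian class-conditional assumption. The helper below is hypothetical, not the b module's actual API; it reuses the MEAN and COVARIANCE keys from the example and assumes equal class priors:

import numpy as np
from scipy.stats import multivariate_normal

def gaussian_classify(pkgDict, sample):
    # Pick the class whose fitted Gaussian assigns the sample the
    # highest log-density (equal class priors assumed).
    best_class, best_logp = None, -np.inf
    for key in pkgDict:
        logp = multivariate_normal.logpdf(
            sample, mean=pkgDict[key][MEAN], cov=pkgDict[key][COVARIANCE])
        if logp > best_logp:
            best_class, best_logp = key, logp
    return best_class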
Example #5
def draw_roc(parts, alpha):
    d_train = []

    for part in parts:
        d_train = d_train + part

    bayes = train(d_train)

    roc_c = []      # True for correct predictions, False for errors
    roc_pred = []   # classifier scores, used to order the curve
    count_y = 0     # number of correct predictions (vertical steps)
    count_x = 0     # number of errors (horizontal steps)
    for answer_class, message in d_train:
        predict_class, pred = classify(bayes, message, 1, 1, alpha)

        if predict_class == answer_class:
            count_y += 1
            roc_c.append(True)
        else:
            count_x += 1
            roc_c.append(False)

        roc_pred.append(pred)

    # Order the points by classifier score before walking the curve.
    roc_c, roc_pred = zip(*sorted(zip(roc_c, roc_pred), key=lambda x: x[1]))

    # Guard against division by zero when either count is empty.
    if count_y == 0:
        count_y = 1

    if count_x == 0:
        count_x = 1

    shag_x = 1 / count_x  # step size along the x axis ("shag" = step)
    shag_y = 1 / count_y  # step size along the y axis
    X = []
    Y = []
    x = 0
    y = 0
    X.append(x)
    Y.append(y)
    for t in roc_c:
        if t:
            y += shag_y
        else:
            x += shag_x

        X.append(x)
        Y.append(y)

    plt.plot(X, Y)
    plt.show()
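
For comparison, a sketch of the same plot built with scikit-learn's roc_curve, assuming pred is a spam score (higher means more spam-like) and treating SPAM as the positive label. This is an alternative sketch, not the author's method:

from sklearn.metrics import roc_curve

def draw_roc_sklearn(parts, alpha):
    d_train = []
    for part in parts:
        d_train = d_train + part
    bayes = train(d_train)

    y_true, scores = [], []
    for answer_class, message in d_train:
        _, pred = classify(bayes, message, 1, 1, alpha)
        y_true.append(int(answer_class == SPAM))
        scores.append(pred)

    fpr, tpr, _ = roc_curve(y_true, scores)  # standard FPR/TPR pairs
    plt.plot(fpr, tpr)
    plt.show()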
Example #6
def spam_test():
    # Naive Bayes spam classifier.
    doc_list = []  # word lists of all parsed files (2-D)
    class_list = []
    full_text = []  # every word that occurs anywhere (1-D)
    text_num = 50
    for i in range(1, 26):  # 25 files per class
        try:
            big_string = open('email/spam/%d.txt' % i).read()  # load a spam file
            word_list = text_parse(big_string)  # parse it into a word list
            doc_list.append(word_list)  # add it to the list of all documents
            full_text.extend(word_list)  # add its words to the global word list
            class_list.append(1)  # label it 1 (spam)
        except:
            text_num -= 1
        try:
            big_string = open('email/ham/%d.txt' % i).read()  # load a ham file
            word_list = text_parse(big_string)
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(0)
        except:
            text_num -= 1
    print(text_num)
    vocab_list = bayes.create_vocab_list(doc_list)  # deduplicated vocabulary
    training_set = list(range(text_num))  # indices of every sample
    test_set = []  # test set
    for i in range(10):  # randomly pick 10 sample indices for the test set
        rand_index = int(random.uniform(0, len(training_set)))  # random index
        test_set.append(training_set[rand_index])  # add it to the test set
        del training_set[rand_index]  # and drop it from the training set
    train_mat = []  # training matrix
    train_class = []  # class label of each matrix row
    for doc_index in training_set:
        words = bayes.set_of_words2vec(
            vocab_list,
            doc_list[doc_index])  # 0/1 vector: which vocab words occur in doc_list[doc_index]
        train_mat.append(words)  # add the row to the matrix
        train_class.append(class_list[doc_index])  # record its label
    p_0v, p_1v, p_spam = bayes.train_nb0(array(train_mat),
                                         array(train_class))  # estimate the class-conditional probabilities
    error_count = 0
    for doc_index in test_set:  # evaluate on the test data
        word_vec = bayes.set_of_words2vec(vocab_list, doc_list[doc_index])
        result = bayes.classify(array(word_vec), p_0v, p_1v,
                                p_spam)  # compare the probabilities to classify
        if result != class_list[doc_index]:
            error_count += 1
    print('The error rate is:', float(error_count / len(test_set)))  # error rate
    return float(error_count / len(test_set))
Example #7
def create_answer(data, token):
    user_id = data['user_id']
    message = get_answer(data['body'].lower())
    cat = bayes.classify(sentence=message)
    if cat[0] == 'news':
        res = yandex.Parse_scv()
        for news in res:
            vkapi.send_message(user_id, token, news)
    elif cat[0] == 'weather':
        res = weather.TakeWeather()
        # The replies below are in Russian: current conditions, cloud
        # cover, sunrise, sunset, and wind speed.
        vkapi.send_message(user_id, token, 'Погода за окном ' + res['status'])
        vkapi.send_message(user_id, token, 'Облачность ' + res['description'])
        vkapi.send_message(user_id, token, 'Восход ' + res['sunrise'])
        vkapi.send_message(user_id, token, 'Закат ' + res['sunset'])
        vkapi.send_message(user_id, token, 'Скорость ветра = ' + str(res['windspeed']))
    elif cat[0] == 0:
        # Fallback category: "Sorry, I don't understand you."
        vkapi.send_message(user_id, token, 'Простите, я вас не понимаю')
Example #8
def emailTest():
    # All documents, stored as one list of per-email word lists
    allDoclist = []
    # Every email's words in one flat list
    allEmail = []
    # Class label of each email
    classList = []
    for i in range(1, 26):
        with open('../email/spam/%d.txt' % i) as text:
            wordList = textParse(text.read())
            allDoclist.append(wordList)
            allEmail.extend(wordList)
            classList.append(1)
        with open('../email/ham/%d.txt' % i) as text:
            wordList = textParse(text.read())
            allDoclist.append(wordList)
            allEmail.extend(wordList)
            classList.append(0)
    # Deduplicate the words across all documents
    newWordsList = bayes.createNewList(allDoclist)
    # 50 emails in total; split them into a training set and a test set
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        # random.uniform returns a float in [0, len); truncating with
        # int() yields a valid index from 0 to len - 1
        randIndex = int(random.uniform(0, len(trainingSet)))
        # add the index to the test set
        testSet.append(trainingSet[randIndex])
        # and remove it from the training set
        del trainingSet[randIndex]
    trainingVec = []
    trainClass = []
    for trainingIndex in trainingSet:
        trainingVec.append(
            bayes.words2Vec(newWordsList, allDoclist[trainingIndex]))
        trainClass.append(classList[trainingIndex])
    vecP1, vecP0, p1 = bayes.trainNB(trainingVec, trainClass)
    errorCount = 0
    for testIndex in testSet:
        testVec = bayes.words2Vec(newWordsList, allDoclist[testIndex])
        if bayes.classify(testVec, vecP1, vecP0, p1) != classList[testIndex]:
            errorCount += 1
            print('classification error: ', allDoclist[testIndex])
    print('The error rate is: ', float(errorCount) / len(testSet))
Example #9
def local_words(feed1, feed0):
    # Infer regional word preferences from personals ads.
    doc_list = []
    class_list = []
    full_test = []
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    print(min_len)
    for i in range(min_len):    # visit one RSS entry at a time
        word_list = bayes_2.text_parse(feed1['entries'][i]['summary'])    # parse feed1's long string into a word list
        doc_list.append(word_list)      # add this word list to the list of all documents
        full_test.extend(word_list)     # all words, duplicates included
        class_list.append(1)        # label 1 (from feed1)
        word_list = bayes_2.text_parse(feed0['entries'][i]['summary'])    # parse feed0's long string into a word list
        doc_list.append(word_list)
        full_test.extend(word_list)
        class_list.append(0)
    vocab_list = bayes.create_vocab_list(doc_list)    # deduplicated vocabulary
    top30_words = cal_most_freq(vocab_list, full_test)  # the 30 most frequent words
    for pair in top30_words:        # drop those 30 words from the vocabulary
        if pair[0] in vocab_list:
            vocab_list.remove(pair[0])
    training_set = list(range(2 * min_len))  # training-set indices
    test_set = []  # test set
    for i in range(20):  # randomly pick 20 samples for the test set
        rand_index = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del training_set[rand_index]
    train_mat = []
    train_class = []
    for doc_index in training_set:  # train the model on the remaining indices
        train_mat.append(bayes.bag_of_words2vec(vocab_list, doc_list[doc_index]))
        train_class.append(class_list[doc_index])
    p_0v, p_1v, p_spam = bayes.train_nb0(array(train_mat), array(train_class))
    error_count = 0
    for doc_index in test_set:  # evaluate the model and count errors
        word_vec = bayes.bag_of_words2vec(vocab_list, doc_list[doc_index])
        result = bayes.classify(word_vec, p_0v, p_1v, p_spam)
        if result != class_list[doc_index]:
            error_count += 1
    print('The error rate is:', float(error_count / len(test_set)))
    return vocab_list, p_0v, p_1v
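
A sketch of what a frequency helper like cal_most_freq plausibly computes, judging by how its result is iterated above; this is an assumption, not the original helper:

from collections import Counter

def cal_most_freq_sketch(vocab_list, full_text):
    vocab = set(vocab_list)
    counts = Counter(token for token in full_text if token in vocab)
    return counts.most_common(30)  # [(word, count), ...] pairs, as used above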
Example #10
def get_best_alpha(parts):
    best_accuracy = 0
    best_alpha = 0

    for alpha_degree in range(0, 10):
        alpha = 1 / (10**alpha_degree)
        sum_accuracy_score = 0

        for d_test in parts:
            d_train = []
            predict = []
            answer = []

            for part in parts:
                if part != d_test:
                    d_train = d_train + part

            bayes = train(d_train)

            for answer_class, message in d_test:
                predict_class, _ = classify(bayes, message, 1, 1, alpha)
                predict.append(int(predict_class == SPAM))
                answer.append(int(answer_class == SPAM))

            sum_accuracy_score += accuracy_score(predict, answer)

        accuracy = sum_accuracy_score / COUNT_PARTS
        if best_accuracy < accuracy:
            best_accuracy = accuracy
            best_alpha = alpha

        print('alpha:', alpha, 'accuracy:', accuracy)

    print("__________________________________________")
    print("Best alpha:", best_alpha)
    print("Best accuracy:", best_accuracy)
    print("__________________________________________")

    return best_alpha
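
The alpha grid above suggests additive (Laplace/Lidstone) smoothing inside classify. A minimal sketch of how such an alpha typically enters a per-class word likelihood; this illustrates the technique and is not necessarily this codebase's exact formula:

def smoothed_word_prob(word_count, class_total, vocab_size, alpha):
    # Additive smoothing: a word never seen in a class keeps a small
    # nonzero probability instead of zeroing out the whole product.
    return (word_count + alpha) / (class_total + alpha * vocab_size)

With alpha = 1 this is classic Laplace smoothing; the grid above shrinks alpha toward 1e-9 to see how little smoothing the data can tolerate.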
Example #11
def main(keyword):
	pos, neg, net = 0, 0, 0
	try:
		tweets = twitter_fetch(keyword)
	except:
		print "Connection Error"
		return pos, neg, net  # nothing fetched, so nothing to classify

	for each_tweet in tweets:
		result = classify(each_tweet)

		if result < 0.35:
			pos = pos + 1
		elif result > 0.65:
			neg = neg + 1
		else:
			net = net + 1

	return pos, neg, net
Example #12
def draw_accuracy_from_lambda(parts, alpha):
    all_lambda_legit = []
    all_accuracy = []

    for i in range(0, 101, 5):
        lambda_legit = 10**i
        sum_accuracy_score = 0

        for test in parts:
            d_train = []
            predict = []
            answer = []

            for part in parts:
                if part != test:
                    d_train = d_train + part

            bayes = train(d_train)

            for answer_class, message in test:
                predict_class, _ = classify(bayes, message, 1, lambda_legit,
                                            alpha)
                predict.append(int(predict_class == SPAM))
                answer.append(int(answer_class == SPAM))

            sum_accuracy_score += accuracy_score(predict, answer)

        accuracy = sum_accuracy_score / COUNT_PARTS
        all_lambda_legit.append(i)
        all_accuracy.append(accuracy)
        print("lambda:", lambda_legit, "accuracy:", accuracy)

    plt.plot(all_lambda_legit, all_accuracy)
    plt.xlabel('10^x lambda legit')
    plt.ylabel('Accuracy')
    plt.show()
Example #13
import bayes

dataset, labels = bayes.load_dataset()
print(dataset)
print(labels)
vocab_list = bayes.create_vocab_list(dataset)
print(vocab_list)
matrix = []
for array in dataset:
    vec = bayes.words_set_to_vec(vocab_list, array)
    matrix.append(vec)
print(matrix)
p_0_v, p_1_v, p_ab = bayes.train(matrix, labels)
print(p_0_v)
print(p_1_v)
print(p_ab)
print('<--->')
test = ['love', 'my', 'dalmation']
vec = bayes.words_set_to_vec(vocab_list, test)
classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
print(test)
print(vec)
print(classify)
print('<--->')
test = ['stupid', 'garbage']
vec = bayes.words_set_to_vec(vocab_list, test)
classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
print(test)
print(vec)
print(classify)
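
For reference, a set-of-words vectorizer like words_set_to_vec conventionally maps a document to a 0/1 vector over the vocabulary. A minimal sketch of that convention (an assumption, not necessarily the bayes module's exact code):

def words_set_to_vec_sketch(vocab_list, input_words):
    present = set(input_words)
    # 1 if the vocabulary word occurs in the document, 0 otherwise.
    return [int(word in present) for word in vocab_list]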
Example #14
def validate_classification(roc_step=toolkit.NUM('0.001')):
    """Compute the ROC."""
    true = dict()
    false = dict()
    true[bayes.HAM] = defaultdict(lambda: toolkit.NUM(0))
    true[bayes.SPAM] = defaultdict(lambda: toolkit.NUM(0))
    false[bayes.HAM] = defaultdict(lambda: toolkit.NUM(0))
    false[bayes.SPAM] = defaultdict(lambda: toolkit.NUM(0))
    ham_roc = list()
    spam_roc = list()

    # Compute best features.
    print "Computing the 300 most characteristic features"
    print "Therefore we compute the mutual information of each word with
    respect to the classification"
    print ""
    best_features = features.best_features()[-300:]
    best_features = map(operator.itemgetter(0), best_features)

    ham_files = toolkit.get_files(bayes.HAM + bayes.TEST)
    spam_files = toolkit.get_files(bayes.SPAM + bayes.TEST)
    ham_count = len(ham_files)
    spam_count = len(spam_files)
    test_samples = zip(ham_files + spam_files, ham_count * [bayes.HAM] +
            spam_count * [bayes.SPAM])
    ham_count = toolkit.NUM(ham_count)
    spam_count = toolkit.NUM(spam_count)

    from math import log, ceil
    total_count = ham_count + spam_count
    total_digits = ceil(log(total_count, 10))
    print_msg = "[%%s] Processing file %%0%dd of %d: %%s " % (total_digits, total_count)
    count = 0
    for filename, clss in test_samples:
        count += 1
        print print_msg % (strftime("%H:%M:%S", gmtime()), count, filename)
        threshold = toolkit.ZERO
        while threshold <= toolkit.ONE:
            classification = bayes.classify(filename, best_features, threshold)
            if (classification == clss):
                true[classification][threshold] += toolkit.ONE
            else:
                false[classification][threshold] += toolkit.ONE
            threshold += roc_step
    threshold = toolkit.ZERO
    while threshold <= toolkit.ONE:
        total_false = toolkit.NUM(false[bayes.HAM][threshold] +
                false[bayes.SPAM][threshold])
        total_true = toolkit.NUM(true[bayes.HAM][threshold] + true[bayes.SPAM][threshold])
        ham_roc.append((false[bayes.HAM][threshold] / total_false,
            true[bayes.HAM][threshold] / total_true))
        spam_roc.append((false[bayes.SPAM][threshold] / total_false,
            true[bayes.SPAM][threshold] / total_true))
        threshold += roc_step
    #roc.reverse()
    print len(ham_roc)
    hamfile = open('ham_roc.dat', 'w')
    spamfile = open('spam_roc.dat', 'w')
    for h_e, s_e in zip(ham_roc, spam_roc):
        hamfile.write(str(h_e[0]) + ' ' + str(h_e[1]) + '\n')
        hamfile.flush()
        spamfile.write(str(s_e[0]) + ' ' + str(s_e[1]) + '\n')
        spamfile.flush()
    hamfile.close()
    spamfile.close()
    return true, false, ham_roc, spam_roc
Example #15
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
@Author Amit Joshi
'''

import sys
from bayes import classify
from t2l import txt_to_list

reload(sys)
sys.setdefaultencoding('utf8')

text = raw_input("Enter Nepali Sentence: ")

result = classify(text)

# if result < 0.35:
# 	print text, "=> Positive"
# elif result > 0.65:
# 	print text, "=> Negative"
# else:
# 	print text, "=> Neutral"

if result < 0.35:
    print "\nPositive\n"
elif result > 0.65:
    print "\nNegative\n"
else:
    print "\nNeutral\n"
ham = getArticles('data/email/ham.pickle')
spam = getArticles('data/email/spam.pickle')

# Generate training set and test set
random.shuffle(ham)
random.shuffle(spam)

hamTestLen = len(ham) // 3
spamTestLen = len(spam) // 3
testData = ham[:hamTestLen] + spam[:spamTestLen]
testLabels = ['ham' for i in range(hamTestLen)
              ] + ['spam' for i in range(spamTestLen)]
trainData = ham[hamTestLen:] + spam[spamTestLen:]
trainLabels = ['ham' for i in range(len(ham) - hamTestLen)
               ] + ['spam' for i in range(len(spam) - spamTestLen)]

# Train model
wordBag = articles.createWordBag(trainData)
trainData = articles.createDataSet(trainData, wordBag)
model = bayes.train(trainData, trainLabels)

# Test model
correct = 0
testData = articles.createDataSet(testData, wordBag)
for i, data in enumerate(testData):
    res = bayes.classify(data, model)
    if res == testLabels[i]:
        correct += 1
print('Correctness: %d/%d' % (correct, len(testData)))
Example #17
def main():
	with open('data/SMSSpamCollection') as input_file:
		text = input_file.read()
	text = text.strip()
	text = text.split('\n')

	# stop word cutoffs as per assignment
	stopWords = [0]

	# xSlice our data into five equal segments for fivefold cross validation
	# each segment has random indices
	indices = random.sample(xrange(len(text)),len(text))
	randomData = [text[i] for i in indices]
	stride = len(randomData)/5
	randomSlices = [[],[],[],[],[]]
	for i in range(1,len(randomData)-1,stride+1):
		randomSlices[i/stride] = (randomData[i-1:i+stride-1]) 	
	
	# iterate through all the xSlices and perform training/classification
	for xSlice in range(5):
		trainSet = list()
		testSet = randomSlices[xSlice]
		for i in range(5):
			if i == xSlice:
				continue
			else:
				trainSet = trainSet + randomSlices[i]

		baseDict = textFeatures.getFeatures(trainSet)
	
		# remove n most frequent words
		
		for cutoff in stopWords:
			wordDict = set([baseDict[i][0] for i in range(0,len(baseDict)-cutoff)])	
			tp = 0
			fp = 0
			tn = 0
			fn = 0	
			# build feature vectors (not really, they're hash tables)
			trainSpam,trainHam = textFeatures.vectorize(trainSet,wordDict)
			testSpam,testHam = textFeatures.vectorize(testSet,wordDict)
   
			probTable,pSpam,pHam = bayes.trainClassifier(trainSpam,trainHam,wordDict)	

			for item in testSpam:
				prediction = (bayes.classify(probTable,pSpam,pHam,item))
				if prediction == 'spam':
					tp = tp + 1
				else:
					fn = fn + 1
			for item in testHam:
				prediction = (bayes.classify(probTable,pSpam,pHam,item))
				if prediction == 'ham':
					tn = tn + 1
				else:
					fp = fp + 1
			
			result = {'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn}
			# write results to temporary file
			fName = 'output/expcutoff%dxSlice%d' % (cutoff,xSlice)
			pickle.dump(result,open(fName,'w'))
Example #18
def main():
    with open('data/SMSSpamCollection') as input_file:
        text = input_file.read()
    text = text.strip()
    text = text.split('\n')

    cutoff = raw_input("How many words to truncate from dictionary: ")
    try:
        cutoff = int(cutoff)
    except:
        print "Invalid input, defaulting to 10"
        cutoff = 10

    # slice our data into five equal segments for fivefold cross validation
    # each segment has random indices
    indices = random.sample(xrange(len(text)), len(text))
    randomData = [text[i] for i in indices]
    stride = len(randomData) / 5
    randomSlices = [[], [], [], [], []]
    for i in range(1, len(randomData) - 1, stride + 1):
        randomSlices[i / stride] = (randomData[i - 1:i + stride - 1])

    print "Entering 'n' will use 1/5th of data for testing, the rest for training"
    print "Entering 'y' will use full cross validation and may take a while"
    crossValidate = raw_input("Perform cross validation? (y/n): ")

    if crossValidate.lower() != 'y' and crossValidate.lower() != 'n':
        print "Invalid input, not using cross validation"
        limit = 1
    elif crossValidate.lower() == 'y':
        limit = 5
    else:
        limit = 1

    resultList = list()
    for xSlice in range(limit):
        trainSet = list()
        testSet = randomSlices[xSlice]
        for i in range(5):
            if i == xSlice:
                continue
            else:
                trainSet = trainSet + randomSlices[i]

        print "Building dictionary..."
        baseDict = textFeatures.getFeatures(trainSet)
        wordDict = set(
            [baseDict[i][0] for i in range(0,
                                           len(baseDict) - cutoff)])

        print "Vectorizing documents..."
        trainSpam, trainHam = textFeatures.vectorize(trainSet, wordDict)
        testSpam, testHam = textFeatures.vectorize(testSet, wordDict)

        print "Training classifier..."
        probTable, pSpam, pHam = bayes.trainClassifier(trainSpam, trainHam,
                                                       wordDict)

        tp, fp, tn, fn = 0.0, 0.0, 0.0, 0.0
        print "Beginning testing..."

        total = len(testSpam) + len(testHam)
        count = 0
        for item in testSpam:
            prediction = bayes.classify(probTable, pSpam, pHam, item)
            if prediction == 'spam':
                tp = tp + 1.0
            else:
                fn = fn + 1.0
            count = count + 1
            if count % 50 == 0:
                print "%d/%d complete" % (count, total)
        for item in testHam:
            prediction = bayes.classify(probTable, pSpam, pHam, item)
            if prediction == 'ham':
                tn = tn + 1.0
            else:
                fp = fp + 1.0
            count = count + 1
            if count % 50 == 0:
                print "%d/%d complete" % (count, total)
        print "Finished testing.."

        result = {'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn}
        resultList.append(result)

    printResults(resultList)
Example #19
# Written by Ningyuan Jiang

import sys
import c45
import bayes

if __name__ == '__main__':
    with open("mushroom.training") as f:
        dataset = [[str(x) for x in line.split()] for line in f]
    with open("mushroom.test") as f:
        testData = [[str(x) for x in line.split()] for line in f]

    #print C45.splitFeature(dataset)
    dicts, classProb = bayes.computeProbs(dataset)
    testData = bayes.classify(testData, dicts, classProb)
    for i in testData:
        print(i)
Example #20
			bayes.train(t, False)
		else:
			bayes.train(t, True)
		total = total + 1
		print total
		if (total % 100) == 0:
			query = "SELECT a.article_id, a.prediction, a.like_flag, b.article_text from test_user_activity a, \
			test_article b where a.article_id = b.article_id and a.article_id between %s and %s and user_name ='brad'" % (row["article_id"] + 1, (row["article_id"] + 101))
			print query
			cur.execute(query) 
			rows = cur.fetchall()
	
			for row in rows:
				t = row["article_text"]
				t = unquote_plus(t)
				prediction = bayes.classify(t)
				rating = bayes.article_rating(t)
				print "Pred: %s, Prob: %.4f" % (prediction, rating)
				print "Prediction for: %d is %s, naive predicted %s and I chose %s. CRM probability was %.4f" % (row["article_id"], prediction, row["prediction"], row["like_flag"], rating)
				#query = """INSERT into test_predictions (article_id, user_name, like_flag, naive_bayes, crm, crm_prob) VALUES (%s, '%s', %s, '%s', '%s', %s)""" % (row["article_id"], "brad", row["like_flag"], row["prediction"], prediction, probability)
				query = """UPDATE test_predictions set pg_bayes = '%s', pg_dislike_score = %.4f where user_name = 'brad' and article_id = '%s'""" % (prediction, rating, row["article_id"])
				print query
				cur.execute(query)

#spam_message = "Viagra, cialis for $2.59!!! Call 555-54-53"
#bayes.train(spam_message, True)
#
#ham_message = "Paul Graham doesn't need Viagra. He is NP-hard."
#bayes.train(ham_message, False)
#
#m1 = "Cheap viagra for 2.59"