def solveAssignment(training_file, test_file, k):

    # read data from both files as it is
    features, training_class, training_data = nb.readFile(training_file)
    f2, ref_ids, test_data = nb.readFile(test_file)

    # find max number of attributes in both the files
    max_attribute, max_attribute_values = nb.findMaxNumAttributes(training_data, test_data)

    # format training and test data which can be used by classifier
    # training_class, training_data = nb.formatData(training_data, max_attribute)
    # test_class, test_data = nb.formatData(test_data, max_attribute)

    # make k classifiers from training_data and class labels using ensemble method adaboost
    kClassifiers, kClassifiers_errors = formEnsembleClassifiers(training_class, training_data, max_attribute_values, k)
    # print kClassifiers_errors
    # predict using all the classifier built using adaboost on test data
    boosted_predicted_class = ensembleClassify(training_data, training_class, kClassifiers, kClassifiers_errors)
    boosted_predicted_test_class = ensembleClassify(test_data, [], kClassifiers, kClassifiers_errors)

    output = []
    for i in range(0, len(boosted_predicted_test_class)):
        output.append([ref_ids[i], boosted_predicted_test_class[i]])

    with open("output.csv", "wb") as f:
        writer = csv.writer(f)
        writer.writerows(output)
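# Note: csv.writer over a file opened in "wb" mode is Python 2 idiom; under
# Python 3 the equivalent would be open("output.csv", "w", newline="").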
Example #2
 def train(self):
     if self.uni_gram:
         self.nb_uni = NaiveBayes.NaiveBayes(self.train_tweets)
         self.nb_uni.train()
     if self.bi_gram:
         self.nb_bi = NaiveBayes.NaiveBayes(self.train_tweets, bi_gram=True)
         self.nb_bi.train()
def ensembleClassify(test_data, test_class, kClassifiers, kClassifiers_errors):
    boosted_predicted_class = []
    k_predictions = []
    k = len(kClassifiers)  # `k` was an undefined global in the original snippet

    for i in range(0, k):
        k_predictions.append(nb.predictClass(test_data, kClassifiers[i]))

    for i in range(0, len(test_data)):
        vote_probability = {}
        for ki in range(0, k):
            if k_predictions[ki][i] not in vote_probability:
                vote_probability[k_predictions[ki][i]] = 0
            if kClassifiers_errors[ki] == 0.0:
                vote_probability[k_predictions[ki][i]] += 0  # guards the log vote weight against a zero error rate
            else:
                vote_probability[k_predictions[ki][i]] += math.log((1.0 - kClassifiers_errors[ki]) / kClassifiers_errors[ki])

        max_vote = 0
        max_class = None
        for class_label in vote_probability:
            if vote_probability[class_label] > max_vote:
                max_vote = vote_probability[class_label]
                max_class = class_label

        boosted_predicted_class.append(max_class)

    if test_class != []:
        nb.generateMeasures(test_class, boosted_predicted_class)

    return boosted_predicted_class
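# Why the log((1.0 - err) / err) vote weight above: in AdaBoost-style voting,
# more accurate classifiers get exponentially more say. A standalone sketch
# with toy error rates (illustrative values, not from the assignment data):
import math

for err in (0.1, 0.25, 0.4):
    print("err=%.2f -> vote weight %.3f" % (err, math.log((1.0 - err) / err)))
# err=0.10 -> 2.197, err=0.25 -> 1.099, err=0.40 -> 0.405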
Example #4
def fitnessEvaluation(chromosomes, fileName):
    choose = list()
    flag = 0
    count = 0
    fitnessValues = list()
    # print("Chromosomes = ", chromosomes)
    for row in chromosomes:
        count += 1
        choose[:] = []
        # print(row)
        for j in range(0, len(row)):
            # print(row[j], end='')
            if (row[j] == 1):
                choose.append(j)
        # print("Choose : ", choose)
        # print()
        dataset = NaiveBayes.loadCsv(fileName, choose)
        # if (flag==0):
        # 	print (dataset)
        # 	flag=1
        # x is the fitness function value
        # print ("dataset = ",dataset)
        x = NaiveBayes.main(dataset)
        fitnessValues.append(x)
        # print ("count = ",count,"  x = ",x)
    return fitnessValues
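# A hedged reading of fitnessEvaluation: each chromosome is a 0/1 mask over
# the dataset's columns, and the 1-bits become the column indices handed to
# NaiveBayes.loadCsv. The mapping on its own, as a runnable fragment:
row = [1, 0, 1, 1, 0]
choose = [j for j in range(len(row)) if row[j] == 1]
print(choose)  # -> [0, 2, 3]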
def main():
    topic_list = []
    with open('topics.txt') as file:
        for line in file:
            topic_list.append(line.strip('\n'))

    print(topic_list)

    tokenized_training_data = xr.tokenize("Training", topic_list, 200)
    tokenized_test_data = xr.tokenize("Test", topic_list, 50)

    wordmap = xr.create_wordmap(tokenized_training_data)
    vector_training = xr.create_vector(tokenized_training_data, wordmap)
    vector_test = xr.create_vector(tokenized_test_data, wordmap)

    # for dst_measure in range(1,4):
    #     print()
    #     if dst_measure == 1:
    #         print("Euclidian distance")
    #     if dst_measure == 2:
    #         print("Hamming distance")
    #     if dst_measure == 3:
    #         print("Cosine Similarity")
    #
    #     for K in range(1, 6, 2):
    #         print('For K = ', K)
    #         knn.knn(vector_training, vector_test, K, dst_measure)

    # test = knn.idf(vector_test)
    # print(test[0])

    nb.naive_bayes(vector_training, vector_test, topic_list, V=len(wordmap))
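# A minimal sketch of what xr.create_wordmap / xr.create_vector might do (the
# real xr module is not shown; this only clarifies the data flow of building a
# vocabulary index and per-document count vectors):
def create_wordmap(tokenized_docs):
    wordmap = {}
    for doc in tokenized_docs:
        for token in doc:
            wordmap.setdefault(token, len(wordmap))
    return wordmap

def create_vector(tokenized_docs, wordmap):
    vectors = []
    for doc in tokenized_docs:
        vec = [0] * len(wordmap)
        for token in doc:
            if token in wordmap:
                vec[wordmap[token]] += 1
        vectors.append(vec)
    return vectors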
def cross_validation_nb(folds_array):
    #Initial values
    corrects = 0
    incorrects = 0

    #Separate train and test data
    for i in range(0,10):
        training_data = []
        test_data = []
        for j in range(0,10):
            if j == i:
                test_data = folds_array[j]
            else:
                training_data = training_data + folds_array[j]

        #Train the algorithm using training data
        train = NaiveBayes.train_nb(training_data)
        #Predict values
        for j in range(0,len(test_data)):
            prediction = NaiveBayes.naive_bayes(test_data[j],train)
            length = len(test_data[j])-1
            #Check if the value is correct
            if prediction == test_data[j][length]:
                corrects = corrects + 1
            else:
                incorrects = incorrects + 1

    return float(corrects)/float(corrects+incorrects)
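# A hedged sketch of how the folds_array argument might be built before
# calling cross_validation_nb: ten round-robin folds over a list of rows whose
# last element is the class label (the real fold builder is not shown).
def make_folds(dataset, n_folds=10):
    folds = [[] for _ in range(n_folds)]
    for idx, row in enumerate(dataset):
        folds[idx % n_folds].append(row)
    return folds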
def main():
    parser = argparse.ArgumentParser(description="Parse Values.")
    # the original mixed a flag ('-arg1') and a positional name in one
    # add_argument call, which argparse rejects; plain --flags keep
    # args.<name> working
    parser.add_argument('--trainPath', type=str, required=True)
    parser.add_argument('--testPath', type=str, required=True)
    parser.add_argument('--n', type=int, required=True)
    parser.add_argument('--lamda', type=float, required=True)
    args = parser.parse_args()

    trainPath = args.trainPath
    testPath = args.testPath
    n = args.n
    lamda = args.lamda

    nbModel = NaiveBayes()

    inout = io.IO()

    trainSet = inout.readDocuments(trainPath, n)
    testSet = inout.readDocuments(testPath, n)

    nbModel.train(trainSet)

    for doc in testSet:
        bestLanguage = nbModel.mostLikelyLanguage(doc.text, lamda)
        print(doc.id + "|" + bestLanguage)  # was the builtin `id`; assuming each doc carries an id attribute
Example #10
def recepcion():
    if request.method == 'POST':

        restText = open("rest.txt", "w")
        Age = request.form['inputAge']
        Sex = request.form['inputSex']
        Bp = request.form['inputBp']
        Ch = request.form['inputCh']
        Nak = request.form['inputNak']
        restText.write(Age)
        restText.write('\n')
        restText.write(Sex)
        restText.write('\n')
        restText.write(Bp)
        restText.write('\n')
        restText.write(Ch)
        restText.write('\n')
        restText.write(Nak)
        restText.write('\n')
        restText.close()

        probText = open("prob1.txt", "w")
        texto = str(x.totalidad())
        probText.write(texto)
        probText.close()

        print("1. Prob total desde readpython es: ", x.totalidad())
        print("2. Edad: ", Age)
        print("3. Nak: ", Nak)
    else:
        print("No se ha posteado nada")
    return render_template('menu.html')
Example #11
def output_test_file(input_filename, output_filename):
    #class, gender, and ticket fare
    #   KNN_classifier = KNN(5, [test_columns.Pclass,test_columns.Sex,test_columns.Fare])
    train_data = load_data('train.csv', 'train')

    bin_data(train_data)
    #   attributes = [ x for x,y in enumerate(att_values) if (y != 'skip' and x != 0)]
    #   DecisionTreeClassifier = DecisionTree(train_data, attributes,'')
    NBClassifier = \
    NaiveBayes([test_columns.PassengerId,test_columns.Sex,test_columns.Fare,test_columns.Pclass,test_columns.Age])

    test_data = load_data(input_filename, 'test')
    output_file_object = csv.writer(open("%s" % output_filename, 'wb'))
    output_file_object.writerow(["Survived", "PassengerID"])

    #   for row in test_data:
    #      if row[test_columns.Sex] == 'female':
    #         row[test_columns.Sex] = 0.0
    #      else:
    #         row[test_columns.Sex] = 1.0

    bin_data(test_data)
    for row in test_data:
        if NBClassifier.predict(row) == 1:
            output_file_object.writerow(["1", row[0]])
        else:
            output_file_object.writerow(["0", row[0]])
Example #12
def testNaiveBayes():
    X = np.mat(np.loadtxt(r"data\iris\iris.txt", delimiter=","))
    numbers = np.mat([0] * 4)

    nb = NaiveBayes(1)
    nb.train(X, numbers)
    result = nb.predict(X)

    print(X[(X[:, -1] != result).A.flatten(), :].shape[0] / X.shape[0])
Example #13
def run():
    'Main loop, it gets and processes user input until "bye".'
    print(
        '''Hi there! My name is Mr. Rabbits!                        (\_/)           
Welcome to Mr. Rabbits' Machine Learning Adventure!      (^.^)
Today we will be exploring the difference between       c(> <)
Naive Bayes classification and k-nearest neighbors.
There are two datasets to choose from: Fisher's Iris flower data set or ________.'''
    )
    while True:
        invalid = False
        info = input(
            '''Please let me know which classifier you would like to explore:
(type 'knn' or 'naive bayes' or 'bagging' or 'bye' to exit).\n''')
        if info == 'bye':
            print('Goodbye! Bring me a carrot next time! :3')
            return
        print(
            "Which dataset will you be exploring today? Fisher's iris flower dataset or Wisconsin breast cancer diagnostics?"
        )
        dataset = input("Type 'FI' or 'BC'\n")

        split = input(
            "What % of the dataset should be split into the training set? (type a value from 0 to 100)\n"
        )
        split = float(split) / 100

        filename = ''
        if dataset == "FI":
            filename = 'iris.csv'
        elif dataset == "BC":
            filename = 'wdbc_clean.csv'

        trainSet = []
        testSet = []

        createDataset(filename, trainSet, testSet, split)

        if info == 'knn':
            k = input("What value should k be? (# of nearest neighbors)\n")
            KNN.run(trainSet, testSet, int(k))
        elif info == 'naive bayes':
            NaiveBayes.run(trainingSet=trainSet, testSet=testSet)
        elif info == 'bagging':
            k = input("What value should k be? (# of nearest neighbors)\n")
            bagSize = input("How big should the bags be?\n")
            bagNum = input("How many bags should I use?\n")
            bagging(int(k), trainSet, testSet, int(bagSize), int(bagNum))
        else:
            invalid = True

        if invalid:
            print(
                "Oops! There was some invalid input somewhere along the way.")
            print("Let's start from the top again.\n")
        else:
            print("Wow! That was fun. Let's do it again.\n")
def run_adaboost(training_data,testing_data,values_in_features,max_index):
	round_error = []
	round_model = []
	round_alpha = []
	for example in training_data:
		example['weight'] = float(1)/float(len(training_data))
	current_round_ctr = 0
	while current_round_ctr < total_rounds:
		sample_training_data = weighted_random_sampling(training_data)
		#print "Done sampling."
		#Get the model trained on the weighted random sample.
		conditional_prob_model = NaiveBayes.train_naive_bayes_get_classifier(sample_training_data,values_in_features)
		#print "Done training on naive bayes model."
		round_model.append(conditional_prob_model)
		predictions = NaiveBayes.get_predictions_from_model(conditional_prob_model,training_data,max_index)
		#Find the error of the model.
		error = find_error(training_data,predictions)
		round_error.append(error)
		#print "Done finding predictions and getting error."
		#print error
		if error >= 0.5:
			break	
		training_data = update_weights(training_data,error,predictions)
		#print "Done updating weights."
		training_data = normalize_weights(training_data)
		#print "Done normalizing weights."
		current_round_ctr += 1
	#print "Done training models for multiple rounds."
	round_alpha = calculate_weight_classifiers(round_error)
	total_classifiers_generated = len(round_error)
	#print "Done calculating alpha."
	#Get the boosted predictions for the test examples.
	adaboost_predictions = []
	for example in testing_data:
		boosted_prediction = 1
		for current_round_ctr in range(0,total_classifiers_generated):
			predicted_label = "-1"
			features_prob_product_positive = 1.0
			features_prob_product_negative = 1.0
			for feature in range(1,max_index + 1):
				if feature in example:
					pass_value = example[feature]
				else:
					pass_value = 0
				string_lookup = str(feature) + ':' + str(pass_value) + ':' + "+1"
				features_prob_product_positive = float(features_prob_product_positive) * float(round_model[current_round_ctr][string_lookup])
				string_lookup = str(feature) + ':' + str(pass_value) + ':' + "-1"
				features_prob_product_negative = float(features_prob_product_negative) * float(round_model[current_round_ctr][string_lookup])
			if (float(features_prob_product_positive*round_model[current_round_ctr]['prior_positive']) >= 
					float(features_prob_product_negative*round_model[current_round_ctr]['prior_negative'])):
				predicted_label = "+1"
			boosted_prediction = float(boosted_prediction) + float(float(round_alpha[current_round_ctr]) * float(predicted_label))
		if boosted_prediction > 0:
			final_prediction = "+1"
		else:
			final_prediction = "-1"
		adaboost_predictions.append(final_prediction)
	#print "Done with Adaboost predictions."
	return adaboost_predictions
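# Hedged sketches of helpers run_adaboost relies on but which are not shown
# here (update_weights, normalize_weights, calculate_weight_classifiers).
# They follow the textbook AdaBoost update and assume each example dict
# carries its label under the key 'label'; the real project code may name
# things differently. total_rounds is likewise assumed to be a module-level
# constant defined elsewhere.
import math

def update_weights(training_data, error, predictions):
	# Scale down the weights of correctly classified examples by err/(1-err).
	beta = error / (1.0 - error)
	for example, predicted in zip(training_data, predictions):
		if predicted == example['label']:
			example['weight'] *= beta
	return training_data

def normalize_weights(training_data):
	total = sum(example['weight'] for example in training_data)
	for example in training_data:
		example['weight'] /= total
	return training_data

def calculate_weight_classifiers(round_error):
	# alpha_t = log((1 - err_t) / err_t): more accurate rounds vote louder.
	return [math.log((1.0 - e) / e) for e in round_error]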
Example #15
def runNBFace(percent,trainsize,testsize):
    trainingData,trainingLabels,testData,testLabels=extractImages("faces",trainsize,testsize)
    featureFunction = featureFuncLib.basicFeatureExtractorFace_2
    trainFeatures = list(map(featureFunction, trainingData))
    testFeatures = list(map(featureFunction,testData))
    trainFeatures,trainingLabels=xPercent(percent,trainFeatures,trainingLabels)
    start = time.time()
    givenTrue, pyTrue, givenFalse, pyFalse=NaiveBayes.trainFace(trainFeatures,trainingLabels)
    end = time.time()
    predict=NaiveBayes.predictFace(testFeatures, testLabels, givenTrue, pyTrue, givenFalse, pyFalse)
    return percentCorrect(testLabels,predict),abs(end-start)
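# Hedged sketch of the xPercent helper used above: keep only the first
# `percent` of the training examples (the project's version may shuffle
# before slicing).
def xPercent(percent, features, labels):
    n = int(len(features) * percent / 100.0)
    return features[:n], labels[:n]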
def get_input(i,t, section_text, tokens, b_dic, headline):
	'''
	User interface for supervised learning. Enables the user to classify
	sections of webpages, or confirm the current guess. Saves changes both to
	b-dic (in memory) and training_tsv file on disk.

	'''

	Helpers.clear_screen()
	print headline
	Helpers.print_progress_bar(t,i)
	print "Token %d of %d" % (i,t)

	# Confirm that we have enough data to make guesses
	can_make_guesses = NaiveBayes.can_make_guesses(b_dic)

	if can_make_guesses:
		guess = NaiveBayes.extract_winner(NaiveBayes.guess(tokens,b_dic))
		print "\nGuess is " + guess.upper() + "\n\n"

		# 'article' -> 'a'
		guess_key = ""
		for key in valid_categories.keys():
			if guess == valid_categories[key]: guess_key = key
		assert guess_key != ""

	print "\n\n\n\n" + section_text
	print "\n*** *** *** *** *** *** *** *** *** *** *** *** *** *** \n"

	cmd = ""
	while True:

		input_msg = "[A] for article, [H] for headline, [S] for spam or junk content, [D] for date, [B] for byline:\n"
		if can_make_guesses: input_msg = "Hit enter to confirm guess of " + guess + " or " + input_msg

		cmd = raw_input(input_msg)

		# 'if not cmd' enables user to just hit enter, and it updates cmd to
		#  the guess value
		if not cmd and can_make_guesses: cmd = guess_key
		else: cmd = cmd.lower()

		if cmd in valid_categories.keys():
			# Save token classification to b_dic
			b_dic = NaiveBayes.train(tokens,valid_categories[cmd], b_dic)
			# Save to the tab file as well, so that the b_dic can be rebuilt
			Helpers.append_file_utf(("," + cmd ), training_tsv)
			break

		else:
			print "Error: '", cmd.upper(), "' is an invalid command. Please try again."

	return b_dic
Example #17
def adaboostWithNaiveBayes(data, label, datatype):
    datadim, datanum = shape(data)
    classifiersweight = []  # classifier weights
    classifiers = []  # per-sample weights (the data distribution) for each classifier
    classifiersdata = []  # data used by each classifier
    classifierslabel = []  # labels of each classifier's data
    sampleweights = [1 / datanum] * datanum  # initial sample weights, i.e. the initial data distribution
    classifiers.append([i for i in sampleweights])
    classifiersdata.append(data)
    classifierslabel.append(label)
    result, errorrate = NaiveBayes.naiveBayes(data, label, data, label,
                                              datatype)  # train the first Bayes classifier on the raw dataset
    if (errorrate > 0.5):
        print('Initial classifier does not meet the requirement')
        return
    cweight = 0.5 * math.log((1 - errorrate) / errorrate, math.e)  # weight of the initial classifier
    classifiersweight.append(cweight)
    # update the sample distribution
    for i in range(datanum):
        sampleweights[i] = sampleweights[i] * math.exp(
            -cweight * label[i] * result[i])
    sumweights = sum(sampleweights)
    for i in range(datanum):  # normalize
        sampleweights[i] = sampleweights[i] / sumweights
    classifiers.append([i for i in sampleweights])
    print('Error of classifier 1:', errorrate)

    # repeatedly train further classifiers
    T = 2
    for iteration in range(1, T):
        newdata, newlabel, gindices = generateData(data, label, sampleweights)
        #result, errorrate = NaiveBayes.naiveBayes(newdata, newlabel, newdata, newlabel, datatype)
        result, errorrate = NaiveBayes.naiveBayes(data, label, newdata,
                                                  newlabel, datatype)
        print('Error of classifier', iteration + 1, ':', errorrate)
        classifiersdata.append(newdata)
        classifierslabel.append(newlabel)
        cweight = 0.5 * math.log((1 - errorrate) / errorrate, math.e)
        classifiersweight.append(cweight)
        # update the sample distribution
        for i in range(datanum):
            #sampleweights[i] = sampleweights[i] * math.exp(-cweight * label[i] * result[gindices[i]])
            sampleweights[i] = sampleweights[i] * math.exp(
                -cweight * label[i] * result[i])
        sumweights = sum(sampleweights)
        for i in range(datanum):  # normalize
            sampleweights[i] = sampleweights[i] / sumweights
        classifiers.append([i for i in sampleweights])

    return classifiersweight, classifiers, classifiersdata, classifierslabel
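# Hedged sketch of the generateData helper used above: resample examples
# according to the current sample weights and return the drawn indices so the
# caller can map predictions back to the originals. It assumes data is a list
# of feature rows (one row per feature, one column per example, as implied by
# shape(data) -> datadim, datanum) and needs Python 3.6+ for random.choices;
# the real helper may differ.
import random

def generateData(data, label, sampleweights):
    datanum = len(label)
    indices = random.choices(range(datanum), weights=sampleweights, k=datanum)
    newdata = [[featurerow[i] for i in indices] for featurerow in data]
    newlabel = [label[i] for i in indices]
    return newdata, newlabel, indices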
def emailTest2():
    # read the email files
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = NaiveBayes.textParse(
            open('email/spam/%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(1)  # 1 = spam

        wordList = NaiveBayes.textParse(
            open('email/ham/%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(0)  # 0 = ham (normal mail)

    vocabList = NaiveBayes.createVocabList(docList)
    trainingSet = list(range(50))  # index lists for the training set and the test set
    testSet = []

    for i in range(10):  # of the 50 emails, randomly pick 40 for training and 10 for testing
        randIndex = int(random.uniform(0, len(
            trainingSet)))  # uniform sample from [low, high): low is included, high is excluded
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])

    testSampleText = []
    for i in range(10):
        testSampleText.append(fullText[testSet[i]])

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(NaiveBayes.bagOfWords2Vec(vocabList,
                                                  docList[docIndex]))
        trainClasses.append(classList[docIndex])

    clf = MultinomialNB()
    model = clf.fit(np.array(trainMat), np.array(trainClasses))

    for testEntry in testSampleText:
        testDoc = np.array(NaiveBayes.bagOfWords2Vec(vocabList, testEntry))
        testResult = model.predict(np.array(testDoc).reshape(1, -1))[0]

        print('The testSample is: ', testEntry, '\n')
        print('It is classified as : ', testResult, '\n')
        print('------------------------------------------------')
Example #19
def predicting_using_naivebayes(train_mat_with, train_por_with, test_mat_with, test_por_with):

    # get temp vars
    totalsize_mat, probability_n_mat, probability_p_mat = NABE.calculate_probability_with(train_mat_with)
    totalsize_por, probability_n_por, probability_p_por = NABE.calculate_probability_with(train_por_with)

    # get final score
    f_score_mat, accuracy_mat = NABE.naive_bayes_with(totalsize_mat, probability_n_mat, probability_p_mat, test_mat_with)
    f_score_por, accuracy_por = NABE.naive_bayes_with(totalsize_por, probability_n_por, probability_p_por, test_por_with)

    # show results
    print('NABE Mat With G1, G2: Accuracy: ' + str(accuracy_mat) + '  f_score: ' + str(f_score_mat))
    print('NABE Por With G1, G2: Accuracy: ' + str(accuracy_por) + '  f_score: ' + str(f_score_por))

    return 0
def binary_naive_bayes():
    model = nb.NaiveBayesModel()
    clean = cn.DataCLean()
    doc_vector = dv.DocumentVector()
    df_clean, uniqueWords = clean.Clean()
    df_clean_test, df_clean_train = split(
        df_clean, 0, int(.3 * (df_clean['class'].count())))
    docVector = doc_vector.binary_docvector(df_clean_train, uniqueWords)
    # print(docVector)
    df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI, numWordsInPI, numWordsInNoPI = model.TrainModel(
        docVector, uniqueWords)
    # print("Model Trained")
    predict_df, test_data = model.predict(Prob_PI, Prob_NoPI, uniqueWords,
                                          df_WordGivenPI, df_WordGivenNoPi,
                                          numWordsInPI, numWordsInNoPI,
                                          df_clean_test, clean)

    print(
        "--------------Binary Naive Bayes Accuracy Stats---------------------------"
    )
    stats = em.Evaluate()
    TP, FN, TN, FP = stats.confusion_matrix(test_data, predict_df)
    print("Accuracy = ", stats.Accuracy(TP, TN, FP, FN))
    print("Precision = ", stats.Precision(TP, FP))
    print("Recall = ", stats.Recall(TP, FN))
    print("fScore = ", stats.fScore(TP, FN, FP))
    print("True Negative = ", stats.TrueNegative(TN, FP))
    print(
        "---------------------------------------------------------------------"
    )
Example #21
def handle_data(train_set, test_set):
    '''
    get the predictions for three algorithms - decision tree, knn and naive bayes
    :param train_set: x
    :param test_set: y
    predict the y_hat, calc the accuracy and write to file the accuracies + the tree
    '''
    # split the files and get the data and labels
    train_data, train_data_labels, attributes, label_key = split_train_data(
        train_set)
    test_data, test_data_labels = split_test_data(test_set)
    # get the algorithms
    decision_tree, knn, naive_bayes = DecisionTree.Model(), Knn.Model(
    ), NaiveBayes.Model()
    algorithms = [decision_tree, knn, naive_bayes]
    accuracies = []
    # for every algorithm - get the prediction on the test set, calc the accuracy and add to list
    for algorithm in algorithms:
        algorithm.set_data(train_data, train_data_labels,
                           [label_key, attributes])
        prediction = algorithm.predict(test_data)
        accuracy = get_acc(prediction, test_data_labels)
        accuracies.append(
            "{0:.2f}".format(accuracy))  # get the 2 digits after point
    # get the output tree and write to the file
    tree = decision_tree.get_tree()
    tree.write_tree(OUTPUT_FILE)
    # write the accuracies to the same file
    write_accuracies(OUTPUT_FILE, accuracies)
def s(x):
    log1,log2 = logistic_regression.predict(x)
    svm1,svm2 = SVM.predict(x)
    nb1,nb2 = NaiveBayes.predict(x)
    X = np.column_stack((log1, log2, svm1, svm2, nb1, nb2))
    prediction = model.predict(X)
    return prediction
Example #23
def test():
    model = nb.NaiveBayesModel()
    path = 'E:/DATA/Sem8/fyp/Training.csv'
    final_df, df = model.extract('E:/DATA/Sem8/fyp/merge.csv')
    count = 0
    start = -200
    end = 0
    accuracy = []
    precision = []
    recall = []
    fscore = []
    stats = em.Evaluate()
    for count in range(5):
        df_test, df_train = split(final_df, start+200, end+200)
        print(df_train)
        li_clean_text = model.clean_data(df_train)
        uniqueWords = model.make_unique_li(li_clean_text)
    # # print(uniqueWords)
        docVector = model.binary_docvector(final_df, uniqueWords)
        df_WordGivenPI,df_WordGivenNoPi,Prob_PI,Prob_NoPI,numWordsInPI,numWordsInNoPI = model.TrainModel(docVector, uniqueWords)
        predict_df, test_data = model.Predict(Prob_PI, Prob_NoPI, uniqueWords, df_WordGivenPI, df_WordGivenNoPi, numWordsInPI, numWordsInNoPI)
        # print("--------------Naive Bayes Accuracy Stats---------------------------")
        TP, FN, TN, FP = stats.confusion_matrix(test_data, predict_df)
        accuracy.append(stats.Accuracy(TP, TN, FP, FN))
        precision.append(stats.Precision(TP, FP))
        recall.append(stats.Recall(TP, FN))
        fscore.append(stats.fScore(TP, FN, FP))
        # print("---------------------------------------------------------------------")
    print("accuracy = ",Average(accuracy))
    print("precison = ", Average(precision))
    print("recall = ", Average(recall))
    print("f-score = ", Average(fscore))
Example #24
def filter_article_div(score_dic, article_div, article_dic):
	'''
	Iterates through the article div, removing anything that the
	classifier says is not article text. Also updates the article_dic with
	a new headline (if found).
	'''

	headline_max = 0

	# Again, like above, need to make multiple loops to make sure
	# that the extract command actually takes.
	for x in range(0,3):
		for child in article_div.contents:
			rankings = score_dic[child]
			guess = NaiveBayes.extract_winner(rankings)
			if guess == "headline":
				# If we some how guess that more than one element is the
				# headline, want to make sure we get the one with
				# the highest score
				if rankings['headline'] > headline_max:
					article_dic['headline'] = child.get_text()
					headline_max = rankings['headline']
			if guess != "article":
				child.extract()

	return article_dic
Example #25
def prediction():
    content = {}
    content['message'] = str(session['email_msg'])

    word_list = utils.clean_message(content['message'])
    print(word_list)
    word_df = utils.make_dataframe([word_list])
    print(word_df.index[0])
    sparse_df = utils.make_sparse_matrix(word_df, word_index)
    sparse_df = np.array(sparse_df)
    full_df = nb.make_full_matrix(sparse_df, VOCAB_SIZE)
    full_df = np.array(full_df)
    output = nb.predict(full_df)
    session['status'] = output[0]
    return redirect(url_for("results"))
    # unreachable in the original: return render_template('prediction.html', results=output[1])
Example #26
def main():
    """
    main method
    :return:
    """
    # get data
    train_df, test_df = generate_df("data/review_polarity/txt_sentoken")

    # separate training data into data, train_labels
    train_labels = pd.DataFrame(train_df["category"])
    train_df = train_df["text"]

    # create model
    nb = NaiveBayes.NaiveBayes()

    # train
    nb.fit(train_df, train_labels)

    # predict
    output = nb.predict(test_df)

    # check accuracy
    df = pd.DataFrame()
    df['guess'] = output['guess']
    df['actual'] = test_df['category']

    df['correct'] = df['guess'] == df['actual']

    print df
    print np.mean(df['correct'])
Example #27
def given_real_data_test():
    patients = pandas.DataFrame.from_csv(
        './data/training_SyncPatient.csv').reset_index()
    transcripts = pandas.DataFrame.from_csv(
        './data/training_SyncTranscript.csv').reset_index()
    transcripts = transcripts[transcripts['Height'] > 0]
    transcripts = transcripts[transcripts['Weight'] > 0]
    transcripts = transcripts[transcripts['BMI'] > 0]
    joined_df = patients.merge(transcripts, on='PatientGuid', how='inner')
    final_df = joined_df.groupby('PatientGuid').first().reset_index()

    female_set = final_df.ix[np.random.choice(
        final_df[final_df['Gender'] == 'F'].index, 500)]
    male_set = final_df.ix[np.random.choice(
        final_df[final_df['Gender'] == 'M'].index, 500)]
    training_data = [(x[2], (x[8], x[9], x[10])) for x in female_set.values]
    training_data += [(x[2], (x[8], x[9], x[10])) for x in male_set.values]
    classifier = NaiveBayes.Classifier()
    for class_label, input_data in training_data:
        classifier.train(classification=class_label, observation=input_data)

    # Manual verification
    pprint.pprint(classifier._calculate_model_parameters())

    # Men
    print("Men")
    print(classifier.classify(observation=(71.3, 210.0, 23.509)))
    print(classifier.classify(observation=(66.0, 268.8, 27.241999999999997)))
    print(classifier.classify(observation=(65.0, 284.0, 30.616)))
    print("Women")
    print(classifier.classify(observation=(60.5, 151.0, 29.002)))
    print(classifier.classify(observation=(60.0, 148.0, 28.901)))
    print(classifier.classify(observation=(60.0, 134.923, 26.346999999999998)))
    assert True, "Always pass until we want to manually evaluate."
Example #28
def main():

	# trainFile = "../../data/spambase/missing_values/{}_percent_missing_train.txt"
	# testFile = "../../data/spambase/missing_values/{}_percent_missing_test.txt"
	set_printoptions(threshold='nan')
	Accuracy_train = ones(10)
	Accuracy_test = ones(10)
	for i in range(10):
		print "Working on data with {} testing set".format(i)
		# Step 1: loading data
		print "Loading data..."
		# trainX, trainY, testX, testY = util.loadData(trainFile.format(i*10), testFile.format(i*10))
		# data = loadtxt('../../data/spambase/spambase.data', delimiter=',')
		# trainX, trainY, testX, testY = util.initialData(data)
		##### gammas
		data = loadtxt('../../data/spambase/spambase.data', delimiter=',')
		trainX, trainY, testX, testY = util.initialGammaData(data, i)

		# # Step 2: training data
		print "Training data..."
		# model = NaiveBayes.train(trainX, trainY)
		# model = NaiveBayes.train_missing_value(trainX, trainY)
		##### gammas
		model = NaiveBayes.train_gamma(trainX, trainY)


		# # Step 3: predict test data
		print "Predicting data..."
		# predict_y = NaiveBayes.test(testX, model)
		# predict_y = NaiveBayes.test_missing_value(testX, model)
		##### gammas
		train_y = NaiveBayes.test_gamma(trainX, model)  # uncommented: train_y is used below
		test_y = NaiveBayes.test_gamma(testX, model)

		# # # Step 4: Calculate the Accuracy.
		print "Accuracy..."
		# accuracy = sum(predict_y == testY) / float(testY.size)
		# print "Accuracy on testing : {:.2f}%".format(accuracy*100)
		# print "....Done...."
		##### gammas
		Accuracy_train[i] = sum(train_y == trainY) / float(trainY.size)
		print "Accuracy on training : {:.2f}%".format(Accuracy_train[i]*100)
		Accuracy_test[i] = sum(test_y == testY) / float(testY.size)
		print "Accuracy on test : {:.2f}%".format(Accuracy_test[i]*100)

	print "Total average accuracy on training: {:.2f}%".format(mean(Accuracy_train)*100)
	print "Total average accuracy on testing: {:.2f}%".format(mean(Accuracy_test)*100)
Example #29
 def __init__(self, filename,classifier='NaiveBayes'):
     self.classifier = NB.NaiveBayes()
     self.filename = filename
     data = pd.read_csv(filename, header=None, \
                             delimiter="\t", quoting=3)
     self.corpus = data[1]
     self.labels = data[0]
     self.build_vocab(self.corpus)
Example #30
def tarea1(entrenamiento, prueba):
    d = Main()
    (t_0, t_1) = d.split(entrenamiento)
    nb = NaiveBayes.NaiveBayes(entrenamiento, t_1, t_0, prueba)
    nb.plot()
    b = Bayes.Bayes(entrenamiento, t_1, t_0, prueba)
    b.plot()
    return
Example #31
def given_one_observation_for_two_classes_test():
    classifier = NaiveBayes.Classifier()
    classifier.train(classification='a class', observation=0)
    classifier.train(classification='b class', observation=100)
    classification = classifier.classify(observation=23.2)
    assert classification is None, "Should classify as the nearest class."
    classification = classifier.classify(observation=73.2)
    assert classification is None, "Should classify as the nearest class."
Example #32
def main():
    """
    Loads data into partitions, creates a Naive Bayes model based on the train
    data, runs the model on the test data, and evaluates its accuracy.
    """
    opts = util.parse_args()
    train_partition, test_partition = util.read_arff(opts.filename)

    nb_model = NaiveBayes(train_partition)

    examples = test_partition.data
    total = len(examples)
    total_correct = 0

    K = test_partition.K
    confusion_matrix = np.zeros((K, K), int)
    for example in examples:
        y_hat = nb_model.classify(example.features)
        y = example.label
        confusion_matrix[y][y_hat] += 1

        if y_hat == y:
            total_correct += 1

    accuracy = round(total_correct / total, 6)
    accuracy_str = "Accuracy: " + str(accuracy) + " ("
    correct_str = str(total_correct) + " out of " + str(total) + " correct)"
    print(accuracy_str + correct_str)
    stretch = 8
    prediction_labels = "   "
    top_row = "   "
    table = ""
    for y_hat in range(K):
        prediction_labels += " " * (stretch -
                                    len(str(y_hat + 1))) + str(y_hat + 1)
        top_row += "-" * stretch
    for y in range(K):
        table += " " + str(y + 1) + "|"
        for y_hat in range(K):
            entry = str(confusion_matrix[y][y_hat])
            table += " " * (stretch - len(entry)) + entry
        table += "\n"
    print("\n\n        prediction")
    print(prediction_labels)
    print(top_row)
    print(table)
Example #33
def run():
    file, training, test = get_input()
    print("File Being Used is " + file)
    print("Ratio of Training Data Being Used: " + str(training))
    print("Ratio of Testing Data Being Used: " + str(test))
    cont = input("Continue with this data? (Y/n)")
    if cont == "n":
        run()
    print("IMPORTING  DATA")
    data = nb.get_data_set(file)
    print("CLEANING DATA")
    data = nb.clean_data_set(data)
    print("IMPORTING CLEANED DATA")
    data = nb.get_data_set(data)
    print("SPLITTING THE DATA INTO TRAINING AND TEST DATA")
    training_data, testing_data = nb.split_data(data, training)
    print("LENGTH OF TRAINING DATA  ->  " + str(len(training_data)))
    print("LENGTH OF TESTING DATA   ->  " + str(len(testing_data)))
    print("CREATING CLASS SUMMARY")
    summary = nb.class_summary(training_data)
    print("MAKING PREDICTIONS ON TESTING DATA BASED OFF OF MODEL")
    testing = nb.prediction(summary, testing_data)
    accuracy, right, items = nb.accuracy(testing_data, testing)
    print("ACCURACY IS  ->  {:2}".format(accuracy))
    print("AMOUNT CORRECT IS    ->  {}".format(right))
    print("OUT OF       ->      {}".format(items))
    print("WRITING TO LOG")
    log = open("log.txt", "w")
    log.write("ACCURACY ->  " + str(accuracy) + "\n" +
              "AMOUNT CORRECT     ->      " + str(right) + "\n" +
              "OUT OF     ->      " + str(items))
    log.close()
Example #34
    def test(self):
        """Test na sztucznych danych."""

        def getfeatures(text):
            """Funkcja do testów."""
            return list(set(text.split()))

        bayes = NaiveBayes.NaiveBayes(getfeatures)

        bayes.feature_count = {('terms,', 'C1'): 1, ('considers', 'C2'): 1,
                    ('independently', 'C3'): 1, ('each', 'C1'): 1,
                    ('that', 'C1'): 1, ('the', 'C3'): 1, ('on', 'C1'): 1,
                    ('features', 'C1'): 1, ('and', 'C3'): 1, ('is', 'C2'): 1,
                    ('feature.', 'C2'): 1, ('For', 'C2'): 1, ('fruit', 'C2'): 1,
                    ('features,', 'C2'): 1, ('classifier', 'C2'): 1,
                    ('(or', 'C2'): 2, ('these', 'C1'): 1, ('the', 'C2'): 2,
                    ('particular', 'C2'): 1, ('may', 'C2'): 1,
                    ('Bayes', 'C2'): 1, ('all', 'C2'): 1, ('feature', 'C2'): 1,
                    ('apple', 'C3'): 1, ('naive', 'C2'): 1, ('depend', 'C1'): 1,
                    ('other', 'C2'): 2, ('if', 'C3'): 1,
                    ('contribute', 'C3'): 1, ('any', 'C2'): 1,
                    ('these', 'C2'): 1, ('4"', 'C3'): 1,
                    ('classifier', 'C1'): 1, ('other', 'C1'): 1,
                    ('of', 'C1'): 1, ('assumes', 'C1'): 1,
                    ('Bayes', 'C1'): 1, ('Even', 'C1'): 1,
                    ('presence', 'C1'): 1, ('the', 'C1'): 2,
                    ('a', 'C2'): 3, ('upon', 'C1'): 1,
                    ('that', 'C3'): 1, ('example,', 'C2'): 1,
                    ('properties', 'C3'): 1, ('this', 'C3'): 1,
                    ('to', 'C2'): 1, ('In', 'C1'): 1,
                    ('round,', 'C3'): 1, ('about', 'C3'): 1,
                    ('absence)', 'C2'): 2, ('of', 'C2'): 3,
                    ('diameter.', 'C3'): 1,
                    ('existence', 'C1'): 1, ('be', 'C3'): 1,
                    ('considered', 'C3'): 1, ('a', 'C1'): 1,
                    ('it', 'C3'): 1, ('an', 'C3'): 1,
                    ('or', 'C1'): 1, ('if', 'C1'): 1,
                    ('presence', 'C2'): 1, ('is', 'C3'): 1,
                    ('to', 'C3'): 2, ('unrelated', 'C2'): 1,
                    ('red,', 'C3'): 1, ('probability', 'C3'): 1,
                    ('naive', 'C1'): 1, ('class', 'C2'): 1,
                    ('in', 'C3'): 1, ('simple', 'C1'): 1}

        bayes.class_count = {'C1': 2, 'C2': 3, 'C3': 2}

        feat_cats = [
            ('of', 'C2'), ('to', 'C3'), ('features', 'C1'),
            ('Bayes', 'C1'), ('of', 'C1'),
            ('to', 'C5'), ('features', 'C3'), ('Bayes', 'C2')]
        probs = [0.0, 0.0, -0.6931,
                 -0.6931, -0.6931,
                 -1e+300, -7.6009, -1.0986]

        for idx in range(len(feat_cats)):
            self.assertAlmostEqual(
                featprob(bayes, feat_cats[idx][0], feat_cats[idx][1]),
                probs[idx], 4)
Example #35
def spamTest():
    #read datas from text to docList and classList
    docList = []
    classList = []
    fullTest = []
    for i in range(1, 26):
        wordList = textParse(open(r'Datas\email\spam\%d.txt' % i).read())
        docList.append(wordList)
        fullTest.extend(wordList)
        classList.append(1)

        wordList = textParse(open(r'Datas\email\ham\%d.txt' % i).read())
        docList.append(wordList)
        fullTest.extend(wordList)
        classList.append(0)
    vocabList = nb.createVocabList(docList)

    #move Data from trainingSet to testSet
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])

    #generate train Mat and Classes
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(nb.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    #train NaiveBayes classifier
    p0V, p1V, pSpam = nb.trainNaiveBayes0(array(trainMat), array(trainClasses))

    #test classifier use testSet
    errorCount = 0
    for docIndex in testSet:
        wordVector = nb.setOfWords2Vec(vocabList, docList[docIndex])
        if nb.classifyNaiveBayse(array(wordVector), p0V, p1V,
                                 pSpam) != classList[docIndex]:
            errorCount += 1

    print 'the error rate is : ', float(errorCount) / len(testSet)
def main():
	nb = NaiveBayes()

	## Training Set
	rTrainFeatures = getFeatures(r"C:\Users\John\Documents\BRMSentimentAnalysis\data\MSFTSmall.txt")
	uTrainFeatures = getFeatures(r"C:\Users\John\Documents\BRMSentimentAnalysis\data\unrelated.txt")
	features = []
	features.append(rTrainFeatures)
	features.append(uTrainFeatures)
	labels = ["Company", "Unrelated"]
	f = float(len(rTrainFeatures)) / (len(rTrainFeatures) + len(uTrainFeatures))
	print f
	nb.addExamples(features, labels)

	## Test Set
	test_features = []
	test_features.append("Apple")
	test_features.append("Mac")

	print "The test example", test_features, "should be labeled 'Company', and is in fact labeled: {0}".format(nb.classify(test_features))
Example #37
 def storeTweetSentiment(self, tweet):
     try:
         related_topic_term = self.isTweetRelatedFF(tweet)[1]
         text = tweet['text']
         sentiment = NaiveBayes.sentimentClassify(text, sentimentTokenizer, stopwords, sentimentClassifier)
         state = getTweetState(tweet)
         
         sentiment_file_writer =  csv.writer(open('FFsentiment.csv','a'), lineterminator='\n')
         sentiment_file_writer.writerow((related_topic_term, tweet['text'], sentiment, state))
     except:
         e = sys.exc_info()[0]
         print( "<p>Error: %s</p>" % e )
 def learnClassifer(self):
    model = NaiveBayes()
    attributes = []
    for j in range(len(self.featureFactory.datatable)):
        instance = {}  # renamed from `dict`, which shadowed the builtin
        instance['cases'] = 1
        instance['attributes'] = {}
        line = self.featureFactory.datatable[j]
        for i in range(len(line)):
            instance['attributes'][str(i)] = line[i]
            attributes.append(str(i))
        instance['label'] = self.featureFactory.classes[j]
        model.add_instances(instance)
    model.set_real(attributes)
    model.train()
    self.model = model
    return pickle.dumps(model).encode('string_escape')
Example #39
def saveCounts():
    with open('total-counts.csv', 'w') as counts_file:
        writer = csv.DictWriter(counts_file, states.keys())
        writer.writeheader()
        writer.writerow(states)
    
if __name__ == '__main__':

    #geolocator = GoogleV3()

    tweets = open('test.json', 'w', encoding='utf-8')
    tweets.close()
    
    sentiment_file_writer =  csv.writer(open('FFsentiment.csv','w'), lineterminator='\n')
    
    sentimentClassifier = NaiveBayes.getSentimentClassifier()
    print("Sentiment Classifier Created")
    
    sentimentTokenizer = happytokenizer.TweetTokenizer()

    #This handles Twitter authentication and the connection to Twitter Streaming API
    Tweetlistener = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, Tweetlistener)

    #This line filters Twitter Streams to capture data posted from the US in English with utf-8 encoding
    while True:
        try:
            stream.filter(languages=['en'], async=False, locations=[-125,25,-65,48])
        except:
            continue  # the snippet was truncated here; retrying the stream is the apparent intent
Example #40
def main():

    trainingFiles, testFiles = getYELPFiles()
    print 'importing ~230,000 reviews...'
    reviewsList = importJSONFiles([trainingFiles[REVIEW]])[0]
    print 'import finished'

    # construct list "y" of scores
    scoreVector = np.asmatrix([review['votes']['useful'] for review in reviewsList]).T

    # GENERATE GRAPH
    #################################
    Graphs.helpfulDist(reviewsList)
    #################################

    # CONCURRENT REGRESSION CONFIGURATIONS
    ###############################################################################################
    pid1 = os.fork()
    if pid1 == 0:
        weightVector, RMSLE = regSentences(reviewsList, scoreVector)       # RMSLE = 0.6447
        exit(RMSLE)

    pid2 = os.fork()
    if pid2 == 0:
        weightVector, RMSLE = regLines(reviewsList, scoreVector)           # RMSLE = 0.6382
        exit(RMSLE)

    pid3 = os.fork()
    if pid3 == 0:
        weightVector, RMSLE = regLinesSqrLines(reviewsList, scoreVector)   # RMSLE = 0.6371
        exit(RMSLE)

    pid4 = os.fork()
    if pid4 == 0:
        weightVector, RMSLE = regLinesLogLines(reviewsList, scoreVector)   # RMSLE = 0.6365
        exit(RMSLE)

    pid5 = os.fork()
    if pid5 == 0:
        weightVector, RMSLE = regLinesSentences(reviewsList, scoreVector)  # RMSLE = 0.6320
        exit(RMSLE)

    pid6 = os.fork()
    if pid6 == 0:
        weightVector, RMSLE = regUserScores(reviewsList, scoreVector, trainingFiles)  # RMSLE = 0.5330
        exit(RMSLE)

    pid7 = os.fork()
    if pid7 == 0:
        weightVector, RMSLE = regLogLinesLogSentences(reviewsList, scoreVector)  # RMSLE = 0.6340
        exit(RMSLE)

    RMSLE1 = os.waitpid(pid1,0)
    RMSLE2 = os.waitpid(pid2,0)
    RMSLE3 = os.waitpid(pid3,0)
    RMSLE4 = os.waitpid(pid4,0)
    RMSLE5 = os.waitpid(pid5,0)
    RMSLE6 = os.waitpid(pid6,0)
    RMSLE7 = os.waitpid(pid7,0)
    ###############################################################################################



    # REGRESSION (with testing) ON ADJECTIVES AND ADVERBS  RMSLE = 0.6329
    #################################################################################
    # CONCURRENT training  (set desired number of training reviews to use inside the method)
    weightVector, RMSLE = concurrentFeatureExtractor(reviewsList, scoreVector)

    # SEQUENTIAL training  (set desired number of training reviews to use inside the method)
    weightVector, RMSLE = regLinesAdjAdv(reviewsList, scoreVector)

    # concurrent testing
    TestSet.testConcAdjAdv(testFiles, weightVector)
    #################################################################################



    # 2 other possible test configurations
    #################################################################
    weightVector, RMSLE = regLinesSentences(reviewsList, scoreVector)
    TestSet.testLinesSentences(testFiles, weightVector)
    #################################################################




    # NAIVE BAYES
    ####################################################
    NaiveBayes.probScoreGivenCategories(trainingFiles)
    ####################################################

    print '\nGot to the end, Terminating...'
##############################
#                            #
# Main program               #
#                            #
##############################
#Read information from the command line
file = sys.argv[1]
examples = sys.argv[2]
algorithm = sys.argv[3]

#Check with algorithm will be used
if algorithm != 'NB':
    algorithm = algorithm.replace("NN", "")

#Read data and train it for Naive Bayes
data = Helper.readfile(file)
train = NaiveBayes.train_nb(data)

#Read example data
f = open(examples, 'r')

#Test every example
for line in f:
    array_line = line.split(',')
    row = []
    length = len(array_line)
    for i in range (0,length):
        row.append(float(array_line[i]))

    #Apply the algorithm
    if algorithm != 'NB':
        print KNN.knearest(int(algorithm),data,row)
    else:
        # assumed missing branch: `train` was built above but never used
        print NaiveBayes.naive_bayes(row,train)
def formEnsembleClassifiers(training_class, training_data, max_attribute_values, k, max_run=5):

    num_training_data = len(training_class)

    ensemble_classifiers = []
    errors_Mi = []

    tuple_weights = [(1.0 / len(training_data)) for i in range(0, len(training_data))]
    #tuple_ids = [i for i in range(0, len(training_class))]

    rk = 0
    while rk < k:  # a for-loop here would ignore the retry below (range() ignores changes to rk)
        run_emis = []
        run_classifiers = []
        run_predictions = []
        run_training_class = []

        current_run = 0
        while True:
            new_training_data = []
            new_training_class = []
            prefixed_weights = prefixScan(tuple_weights)
            for i in range(0, num_training_data):
                pick_id = drawRandomPD(prefixed_weights)#random.choice(tuple_ids)
                new_training_data.append(training_data[pick_id])
                new_training_class.append(training_class[pick_id])

            Mi = nb.makeClassifier(new_training_class, new_training_data, max_attribute_values)
            predicted_class = nb.predictClass(new_training_data, Mi)

            eMi = findMiError(predicted_class, new_training_class, tuple_weights)

            if eMi < 0.5:# and (errors_Mi != [] and eMi < min(errors_Mi)):
                ensemble_classifiers.append(Mi)
                errors_Mi.append(eMi)
                break

            run_emis.append(eMi)
            run_classifiers.append(Mi)
            run_predictions.append(predicted_class)
            run_training_class.append(new_training_class)

            current_run += 1
            #
            # if current_run == max_run:
            #     eMi = max(run_emis)
            #     min_run_id = run_emis.index(eMi)
            #     ensemble_classifiers.append(run_classifiers[min_run_id])
            #     predicted_class = run_predictions[min_run_id]
            #     new_training_class = run_training_class[min_run_id]
            #     errors_Mi.append(eMi)
            #     break

        print eMi

        new_tuple_weights = assignNewTupleWeights(tuple_weights, eMi, predicted_class, new_training_class)
        if new_tuple_weights is None:
            continue  # retry this round; the original's `rk -= 1` inside a for-loop had no effect
        tuple_weights = new_tuple_weights
        #tuple_ids = makeNewTupleIds(tuple_weights, num_training_data)
        rk += 1

    return ensemble_classifiers, errors_Mi
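# Hedged sketches of the sampling helpers used above (prefixScan,
# drawRandomPD): a running (prefix) sum of the tuple weights plus a binary
# search draw an index from the discrete distribution those weights define.
import bisect
import random

def prefixScan(weights):
    prefixed, running = [], 0.0
    for w in weights:
        running += w
        prefixed.append(running)
    return prefixed

def drawRandomPD(prefixed_weights):
    r = random.uniform(0.0, prefixed_weights[-1])
    return bisect.bisect_left(prefixed_weights, r)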
# imports assumed by this standalone snippet (not shown in the original)
from numpy import array
from sklearn.naive_bayes import GaussianNB
import NaiveBayes

data = [
    [5.92, 190, 11],
    [5.58, 170, 12],
    [5.92, 165, 10],
    [5, 100, 6],
    [5.5, 150, 8],
    [5.42, 130, 7],
    [5.75, 150, 9],
    [6, 180, 12],
    [7, 220, 11],
]
labs = ["male", "male", "male", "female", "female", "female", "female", "male", "male"]

pred_data = [[6, 130, 8], [7, 199, 12], [5.42, 170, 8], [5.8, 220, 11]]
node, prior_prob = NaiveBayes.train(data, labs)

output = NaiveBayes.predict(node, prior_prob, pred_data)

for predicted_value in output:
    print predicted_value


### NOW TESTING ITS OUTPUT COMPARED TO sklearn.naive_bayes implementation.

X = array(data)
y = array(labs)

gnb = GaussianNB()

classifier = gnb.fit(X, y)
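# The comment above promises a comparison with sklearn's implementation, but
# the snippet stops at fit(); a hedged sketch of the missing final step,
# printing sklearn's labels for the same prediction rows:
print classifier.predict(array(pred_data))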
if __name__ == "__main__":
	if len(sys.argv) != 3:
		print NaiveBayes.usage("NBAdaBoost.py")
		sys.exit(1)
	else:
		train_file_name = sys.argv[1]
		test_file_name = sys.argv[2]
		train_file = open(train_file_name,"r")
		test_file = open(test_file_name,"r")
		training_data,testing_data,values_in_features,max_index = NaiveBayes.process_files(train_file,test_file)
		train_file.close()
		test_file.close()
		adaboost_predictions = run_adaboost(training_data,training_data,values_in_features,max_index)
		NaiveBayes.print_metrics(training_data,adaboost_predictions)
		adaboost_predictions = run_adaboost(training_data,testing_data,values_in_features,max_index)
		NaiveBayes.print_metrics(testing_data,adaboost_predictions)
Example #45
                    else:
                        if (int(df[df['날짜'] >= issueDate].tail(1)['종가']) < int(df[df['날짜'] < issueDate].head(1)['종가'])):
                            print('down')
                            docList.append(wordList)
                            classList.append(0)
                        else:
                            print('hold')
                            docList.append(wordList)
                            classList.append(0)
            except:
                continue

        else:
            pass

    vocaList = NaiveBayes.createVocabList(docList)

    trainMat = []
    for postinDoc in docList:
        trainMat.append(NaiveBayes.setOfWords2Vec(vocaList, postinDoc))

    print('vocaList : ', vocaList)
    print('trainMat : ', trainMat)
    print('testEntry : ', testEntry)
    p0V, p1V, pAb = NaiveBayes.trainNB0(array(trainMat), array(classList))

    # testEntry = ['카카오', '인공지능', '알파고']
    thisDoc = array(NaiveBayes.setOfWords2Vec(vocaList, testEntry))
    print('Stock direction after the article', testTitle, 'dated', testIssueDate, '?\n', NaiveBayes.classifyNB(thisDoc, p0V, p1V, pAb))
else:
    print("Error Code:" + rescode)