def main(): print("#--- zerorok -----------------------") print("weathernon") f = open("weather.csv", "r") # ZeroR(f,3) abcd = Abcd() abcd.Abcds(f, 3, ZeroR(Tbl())) f.close() print("diabetes") f = open("diabetes.csv", "r") # ZeroR(f,3) abcd = Abcd() abcd.Abcds(f, 3, ZeroR(Tbl())) f.close() print("#--- nbok -----------------------") print("weathernon") f = open("weather.csv", "r") abcd2 = Abcd() abcd2.Abcds(f, 4, NB(Tbl())) f.close() print("diabetes") f = open("diabetes.csv", "r") abcd2 = Abcd() abcd2.Abcds(f, 5, NB(Tbl())) f.close()
def buildNBInput(documents, allTokens):
    nbInput = []
    for doc in documents:
        row = {}
        row[0] = int(float(doc.readability) + 0.5)
        row[1] = doc.name
        for i in range(len(allTokens)):
            word = allTokens[i]
            if word in doc.tokens:
                row[i + 2] = int(doc.tokens[word])
        nbInput.append(row)
    nbTest = []
    nbTraining = []
    random.shuffle(nbInput)
    for t in nbInput:
        if random.random() >= 0.8:
            nbTest.append(t)
        else:
            nbTraining.append(t)
    print("Using %d instances: %d for training and %d for test" % (len(nbInput), len(nbTraining), len(nbTest)))
    # nbInput is ready
    nb = NB()
    nb.trainClassifier(nbTraining)
    nb.testInBatch(nbTest)
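# A minimal, hypothetical sketch of the document objects buildNBInput expects.
# Only the attribute names (readability, name, tokens) come from the function
# above; the class name SimpleDoc and all example values are illustrative.
class SimpleDoc:
    def __init__(self, name, readability, tokens):
        self.name = name                # stored at feature index 1
        self.readability = readability  # rounded to an int label at index 0
        self.tokens = tokens            # dict of token -> count, mapped to indices >= 2

docs = [SimpleDoc("Asthma", 3.2, {"lung": 4, "airway": 2})]
all_tokens = ["lung", "airway", "therapy"]
# buildNBInput(docs, all_tokens)  # also needs `random` and the NB class from this module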
def main():
    train_filename = sys.argv[1]
    test_filename = sys.argv[2]
    if sys.argv[3][1] == 'N':
        k_value = int(sys.argv[3][0])
        knn = KNN()
        train_data, label_data = knn.getTrainData(train_filename)
        test_data = knn.getData(test_filename)
        for i in range(len(test_data)):
            neighbors = knn.get_neighbors(train_data, label_data, test_data[i], k_value)
            print(knn.vote(neighbors))
    else:
        nb = NB()
        train_data = nb.getData(train_filename)
        test_data = nb.getData(test_filename)
        result = nb.getPredictions(test_data, train_data)
        # separated = nb.separateByClass(train_data)
        # print(separated[1][separated[1]=='yes'].count())
        # print(result[20][8], result[21][8], result[22][8])
        for i in range(len(result)):
            if result[i][8] == 1:
                print('yes')
            else:
                print('no')
def main():
    # production
    # logging.basicConfig(filename='getSimpleAndRegularEnglish.log', filemode='w', level=logging.INFO,
    #                     format='%(asctime)s-%(levelname)s: %(message)s', datefmt='%d/%m/%Y-%H:%M:%S')
    # testing
    logging.basicConfig(filename='getSimpleAndRegularEnglish.log', filemode='w', level=logging.DEBUG,
                        format='%(asctime)s-%(levelname)s: %(message)s', datefmt='%d/%m/%Y-%H:%M:%S')
    simpleApi = myWikiApi("http://simple.wikipedia.org/w/api.php?")
    enApi = myWikiApi("http://en.wikipedia.org/w/api.php?")
    maxDepth = 15
    dataDir = "data"
    # Create or load the training set
    arffh = ArffHandler()
    arffFileCreated = generateTraining(enApi, "training", arffh)
    [featureNames, featureVector] = arffh.readArffFile(arffFileCreated)
    nb = NB()
    nb.trainClassifier(featureVector)
    initialCategories = [["Category:Medicine"]]
    for category in initialCategories:
        simpleApi.getAllSubCategories(category, maxDepth, nb, arffh)
    simpleCategories = simpleApi.getVisitedCategories()
    simplePageTitles = []
    logging.info("Total number of simple categories used: %d", len(simpleCategories))
    for category in simpleCategories:
        simplePageTitles += simpleApi.getAllPagesBelongingToACategory(category)
    equals = 0
    total = 0
    for page in simplePageTitles:
        print(page)
        pageFileName = re.sub(" ", "_", page)
        simpleFile = open(dataDir + "/" + pageFileName + ".simple", "w")
        enFile = open(dataDir + "/" + pageFileName + ".en", "w")
        simpleContent = simpleApi.getPageContent(page, bagOfWords=False)
        enContent = enApi.getPageContent(page, bagOfWords=False)
        if simpleContent and enContent:
            for w in enContent:
                enFile.write("%s" % w)
            for w in simpleContent:
                simpleFile.write("%s" % w)
            # print("simpleContent:", simpleContent)
            # print("enContent:", enContent)
            print("Equal?", simpleContent == enContent)
            if simpleContent == enContent:
                equals += 1
            total += 1
    print("Number of equals = %d . Total number = %d" % (equals, total))
def main(): #Simple argument consistency check #Exit if arguments not specified if len(sys.argv) != 3: print("usage: python3 nbclassify.py MODELFILE TESTFILE") sys.exit(0) #Create an instance of Naive Bayes class in classification mode naive_bayes = NB('CLASSIFY') #Load the model file into the class naive_bayes.load(sys.argv[1]) #Get the predictions from the classifier predictions = naive_bayes.classify(sys.argv[2]) #print out the predictions to stdout for prediction in predictions: print(prediction)
def create_architecture(self, offsets=None, mode='add', in_proba=False):
    ## Generate abiotic and biotic covariates
    self.gen_covs()
    ## Abiotic response
    x_abio = self.fe_env_net(self.env_feat)
    ## Here, replace the following dense layer with another network that yields regression parameters given traits
    if in_proba:
        print('HSM proba given')
        abio_resp = x_abio
    else:
        abio_resp = tfkl.Dense(self.m, use_bias=self.hsm_config['archi']['fit_bias'])(x_abio)
    ## Generate biotic response
    if self.var_assoc:
        x_bio = tf.expand_dims(self.fe_bio_net(self.bio_feat), axis=1)
    else:
        x_bio = tfkl.Lambda(lambda x: tf.ones((tf.shape(x)[0], 1, self.d)),
                            name=self.model_name + '_loadings')(abio_resp)
    assoc = self.association(x_bio)
    bio_resp = tfkl.Dot(axes=1)([self.counts, assoc])
    ### Aggregate abiotic and biotic effects
    drivers = [abio_resp, bio_resp]
    # if self.im_config['archi']['fit_bias'] == 'offset':
    #     drivers += [tf.expand_dims(tf.constant(offsets, dtype=tf.float32), 0)]
    ### Aggregation is done using addition here; it could be extended to more complex differentiable functions
    if (self.dist[0] == 'binomial') & (mode == 'bam'):
        ## Add an intercept (useful for basal species)
        off_bio_resp = BiasLayer(name=self.model_name + '_indep_offset')(bio_resp)
        pred = tfkl.Activation('sigmoid')(abio_resp) * tfkl.Activation('sigmoid')(off_bio_resp)
        self.eta_model = tfk.Model(self.inputs, [abio_resp, off_bio_resp])
    else:
        logits = tfkl.Add(name=self.model_name + '_out')(drivers)
        if self.dist[0] in ['negbin']:
            self.disp = GlobalParam(self.m, 'disp')
            logits = self.disp(logits)
            self.nbr = NB(theta_var=self.disp.kernel)
        pred = tfkl.Activation(act_fn.get(self.dist[0]))(logits)
        self.eta_model = tfk.Model(self.inputs, logits)
    self.pred_model = tfk.Model(self.inputs, pred)
    self.hsm_model = tfk.Model(self.env_in, abio_resp)
    if self.var_assoc:
        self.assoc_model = tfk.Model(self.bio_in, assoc)
def main(): #Simple argument consistency check #Exit if arguments not specified if len(sys.argv) != 3: print("usage: python3 nblearn.py TRAININGFILE MODELFILE") sys.exit(0) #Create an instance of Naive Bayes class in training mode naive_bayes = NB('TRAIN') #Fit the data with the training file naive_bayes.fit(sys.argv[1]) #Train the classifier naive_bayes.train() #Output a model file to the file specified naive_bayes.generate_model(sys.argv[2])
def main():
    train_filename = sys.argv[1]
    # This command cross-validates the given training data set
    if sys.argv[2] == 'cross_validate':
        # Cross-validate the given training data set with the kNN algorithm
        if sys.argv[3][1] == 'N':
            k_value = int(sys.argv[3][0])
            knn = KNN()
            print(knn.cross_validation(k_value, train_filename))
        # Cross-validate the given training data set with the NB algorithm
        else:
            nb = NB()
            print(nb.cross_validation(train_filename))
    # This command generates 10-fold stratified data from the given training dataset in the required format
    # The required format is described in the assignment documentation
    elif sys.argv[2] == 'write_stratification':
        knn = KNN()
        knn.writeStratification(train_filename, "10_folds_stratified.csv")
    else:
        test_filename = sys.argv[2]
        if sys.argv[3][1] == 'N':
            k_value = int(sys.argv[3][0])
            knn = KNN()
            train_dataset = knn.getData(train_filename)
            test_data = knn.getData(test_filename)
            train_data, label_data = knn.splitAttributeLabels(train_dataset)
            results = knn.knn_predict(train_data, label_data, test_data, k_value)
            for i in results:
                print(i)
        elif sys.argv[3] == 'NB':
            nb = NB()
            train_data = nb.getTrainData(train_filename)
            test_data = nb.getTestData(test_filename)
            result = nb.getPredictions(test_data, train_data)
            for i in range(len(result)):
                if result[i][-1] == 1:
                    print('yes')
                else:
                    print('no')
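# Hedged usage sketch for the command-line interface above; the script name
# main.py and the file names train.csv / test.csv are placeholders. The third
# argument is parsed as "<k>NN" (single-digit k, e.g. "5NN") for kNN and as
# "NB" for Naive Bayes.
#
#   python main.py train.csv cross_validate 5NN     # cross-validate with 5-nearest-neighbours
#   python main.py train.csv cross_validate NB      # cross-validate with Naive Bayes
#   python main.py train.csv write_stratification   # write 10_folds_stratified.csv
#   python main.py train.csv test.csv 5NN           # predict test.csv with kNN
#   python main.py train.csv test.csv NB            # predict test.csv with Naive Bayes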
zr = ZeroR()
zr.train(t, rows)
print("\ndiabetes")
zr.dump()

# NB
print("\n\n#--- Nbok ---------------------")
tbl = Tbl("weathernon.csv")
rows = []
for lst in tbl.fromString(False, "file"):
    rows.append(lst)
c = Col()
tbl.cols = c.colNum(tbl.rows)
nb = NB(tbl, 3)
nb.train(tbl, rows)
print("\nweathernon")
nb.dump()
print()

tbl = Tbl("diabetes.csv")
rows = []
for lst in tbl.fromString(False, "file"):
    rows.append(lst)
c = Col()
tbl.cols = c.colNum(tbl.rows)
nb = NB(tbl, 19)
nb.train(tbl, rows)
print("\ndiabetes")
from NB import NB
from scipy.io import arff
import pandas as pd
import sys
import random
import numpy as np


# Read an ARFF input file into a pandas DataFrame
def readInput(file):
    dataset, meta = arff.loadarff(file)
    data = pd.DataFrame(dataset)
    # Decode all attributes in the dataframe from bytes to str
    for att in meta._attrnames:
        data[att] = data[att].str.decode('utf-8')
    return data, meta


# Define cases
trainFile = sys.argv[1]
testFile = sys.argv[2]
model = sys.argv[3]
trainData, meta = readInput(trainFile)
testData, meta = readInput(testFile)
if model == "n":
    accuracy = NB(trainData, testData, meta)
elif model == "t":
    accuracy = TAN(trainData, testData, meta)
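# Hedged usage sketch for the script above; the script name bayes.py and the
# ARFF file names are placeholders. Note that TAN is referenced but never
# imported in this snippet, so an import analogous to "from NB import NB" is
# presumably needed. NB(trainData, testData, meta) is called like a function
# and returns an accuracy, so a natural follow-up is simply print(accuracy).
#
#   python bayes.py train.arff test.arff n   # Naive Bayes
#   python bayes.py train.arff test.arff t   # TAN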
def __init__(self):
    NB.__init__(self)
                                 'sci.med', 'sci.space', 'talk.politics.misc'
                             ],
                             columns=[
                                 'comp.graphics', 'rec.sport.hockey',
                                 'sci.med', 'sci.space', 'talk.politics.misc'
                             ])
        plt.figure(figsize=(10, 7))
        ax = seaborn.heatmap(df_cm, annot=True)
        ax.set(xlabel='Ground Truth', ylabel='Predicted')
        plt.title("TF-IDF/" + str(j) + "/" + str(i))
        plt.savefig("plots/TF_IDF_" + str(j) + "_" + str(i) + ".png")
        # plt.show()

        model = NB()
        model.fit(trainx, trainy)
        result = model.predict(testx)
        acr = accuracy(result, testy)

        # f = open('accuracy.txt', mode='a')
        # f.write("NA_NA_" + str(j) + "_" + str(i) + " " + str(acr) + "\n")
        # f.close()
        conf = confusion_matrix(testy, result,
                                labels=[
                                    'comp.graphics', 'rec.sport.hockey',
                                    'sci.med', 'sci.space', 'talk.politics.misc'
                                ])
    print('parents of each node (attribute):\n', TAN_data.parents, '\n')
    print('\nAverage 0/1 loss over all folds:\n', round(sum(loss01) / len(loss01), 3))
if Global_V.SCHEME.upper() == 'KDB':
    pass
if Global_V.SCHEME.upper() == 'AODE':
    pass
if Global_V.SCHEME.upper() == 'NB':
    result = []
    fold_count = 0
    for CrossTest_count in range(0, data_len, data_len // 10):  # cross-validation
        p_c_fold = []
        loss01_fold = 0
        NB_data = NB(data_initial, 0.9, CrossTest_count)
        NB_data.train()
        p_c_fold, result_fold = NB_data.classify()
        result.append(result_fold)
        # print('\nFold', fold_count, 'result:\n')
        print('\nCross-validation fold', fold_count)
        loss01_fold = estimate_Output(NB_data.testData, p_c_fold, result_fold, 1)
        loss01.append(loss01_fold)
        fold_count += 1
        # Each classification run must keep:
        # 1. p_y: the probability of each instance in testData being assigned to each class Ci
        # 2. the classification result: result
        # output(result, TAN_car.testData, 1)
        # The output function should report: 1. the probability of each test instance for each class,
        # 2. the classification result, 3. testData (plus a mode parameter 1, 2 or 3; different modes produce different output)
        # Compare the prediction results with the real results
        if Global_V.PRINTPAR == 3:
            print('testset:\n', NB_data.testData, '\n')
            print('p_ci= \n', p_c_fold, '\n', 'result:\n', result)
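# Hedged note on the fold indexing above: with data_len = 100 (illustrative),
# range(0, data_len, data_len // 10) yields the start offsets 0, 10, ..., 90,
# so each CrossTest_count marks one held-out slice. The 0.9 passed to
# NB(data_initial, 0.9, CrossTest_count) presumably denotes the training
# fraction; the constructor itself is not shown here, so that reading is an
# assumption.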