def main():
    print("#--- zerorok -----------------------")
    print("weathernon")
    f = open("weather.csv", "r")
    # ZeroR(f,3)
    abcd = Abcd()
    abcd.Abcds(f, 3, ZeroR(Tbl()))
    f.close()
    print("diabetes")
    f = open("diabetes.csv", "r")
    # ZeroR(f,3)
    abcd = Abcd()
    abcd.Abcds(f, 3, ZeroR(Tbl()))
    f.close()
    print("#--- nbok -----------------------")
    print("weathernon")
    f = open("weather.csv", "r")
    abcd2 = Abcd()
    abcd2.Abcds(f, 4, NB(Tbl()))

    f.close()
    print("diabetes")
    f = open("diabetes.csv", "r")
    abcd2 = Abcd()
    abcd2.Abcds(f, 5, NB(Tbl()))
    f.close()
def buildNBInput(documents, allTokens):
    nbInput = []

    # Build one feature dictionary per document: rounded readability score,
    # document name, then counts for every token present in the document
    for doc in documents:
        features = {}
        features[0] = int(float(doc.readability) + 0.5)
        features[1] = doc.name

        for i in range(len(allTokens)):
            word = allTokens[i]

            if word in doc.tokens:
                features[i + 2] = int(doc.tokens[word])
        nbInput.append(features)

    # Randomly split the instances, roughly 80% training / 20% test
    nbTest = []
    nbTraining = []
    random.shuffle(nbInput)
    for t in nbInput:
        if random.random() >= 0.8:
            nbTest.append(t)
        else:
            nbTraining.append(t)

    print("Using %d instances: %d for training and %d for test" % (len(nbInput), len(nbTraining), len(nbTest)))
    # nbInput is ready
    nb = NB()
    nb.trainClassifier(nbTraining)

    nb.testInBatch(nbTest)
def main():
    train_filename = sys.argv[1]
    test_filename = sys.argv[2]
    # an argument like '3NN' selects kNN with k = 3; anything else selects NB
    if sys.argv[3][1] == 'N':
        k_value = int(sys.argv[3][0])
        knn = KNN()
        train_data, label_data = knn.getTrainData(train_filename)
        test_data = knn.getData(test_filename)
        for i in range(len(test_data)):
            neighbors = knn.get_neighbors(train_data, label_data, test_data[i],
                                          k_value)
            print(knn.vote(neighbors))

    else:
        nb = NB()
        train_data = nb.getData(train_filename)
        test_data = nb.getData(test_filename)
        result = nb.getPredictions(test_data, train_data)
        # separated = nb.separateByClass(train_data)
        # print(separated[1][separated[1]=='yes'].count())
        # print(result[20][8],result[21][8],result[22][8])
        for i in range(len(result)):
            if result[i][8] == 1:
                print('yes')
            else:
                print('no')
def main():
    #production
    #logging.basicConfig(filename='getSimpleAndRegularEnglish.log', filemode='w', level=logging.INFO, format='%(asctime)s-%(levelname)s: %(message)s', datefmt='%d/%m/%Y-%H:%M:%S')
    #testing
    logging.basicConfig(filename='getSimpleAndRegularEnglish.log', filemode='w', level=logging.DEBUG, format='%(asctime)s-%(levelname)s: %(message)s', datefmt='%d/%m/%Y-%H:%M:%S')
    
    simpleApi = myWikiApi("http://simple.wikipedia.org/w/api.php?")
    enApi = myWikiApi("http://en.wikipedia.org/w/api.php?")
    maxDepth = 15
    dataDir = "data"

    #Create or load training set
    arffh = ArffHandler()
    arffFileCreated = generateTraining(enApi, "training", arffh)
    [featureNames, featureVector] = arffh.readArffFile(arffFileCreated)
    
    nb = NB()
    nb.trainClassifier(featureVector)

    initialCategories = [["Category:Medicine"]]

    for category in initialCategories:
        simpleApi.getAllSubCategories(category, maxDepth, nb, arffh)

    simpleCategories = simpleApi.getVisitedCategories()
    simplePageTitles = []
   
    logging.info("Total number of simple categories used: %d", len(simpleCategories))
    for category in simpleCategories:
        simplePageTitles += simpleApi.getAllPagesBelongingToACategory(category)

    equals = 0
    total = 0
    for page in simplePageTitles:
        print(page)

        pageFileName = re.sub(" ","_",page)
        simpleFile = open(dataDir + "/" + pageFileName + ".simple", "w")
        enFile = open(dataDir + "/" + pageFileName + ".en", "w")

        simpleContent = simpleApi.getPageContent(page, bagOfWords=False)
        enContent = enApi.getPageContent(page, bagOfWords=False)
        
        if simpleContent and enContent:

            for w in enContent:
                enFile.write("%s" % w)
            for w in simpleContent:
                simpleFile.write("%s" % w)

            #print "simpleContent:", simpleContent
            #print "enContent:", enContent
            print "Equal?", simpleContent == enContent
            if simpleContent == enContent:
                equals += 1
            total += 1

    print ("Number of equals = %d . Total number = %d" % (equals, total))
def main():

    #Simple argument consistency check
    #Exit if arguments not specified
    if len(sys.argv) != 3:
        print("usage: python3 nbclassify.py MODELFILE TESTFILE")
        sys.exit(0)

    #Create an instance of Naive Bayes class in classification mode
    naive_bayes = NB('CLASSIFY')
    #Load the model file into the class
    naive_bayes.load(sys.argv[1])
    #Get the predictions from the classifier
    predictions = naive_bayes.classify(sys.argv[2])

    #print out the predictions to stdout
    for prediction in predictions:
        print(prediction)
Example #6
    def create_architecture(self, offsets=None, mode='add', in_proba=False):
        ## generate abiotic and biotic covariates
        self.gen_covs()

        ## Abiotic response
        x_abio = self.fe_env_net(self.env_feat)

        ## Here, the following dense layer could be replaced by another network that yields regression parameters given traits
        if in_proba:
            print('HSM proba given')
            abio_resp = x_abio
        else:
            abio_resp = tfkl.Dense(
                self.m, use_bias=self.hsm_config['archi']['fit_bias'])(x_abio)

        ## Generate biotic response
        if self.var_assoc:
            x_bio = tf.expand_dims(self.fe_bio_net(self.bio_feat), axis=1)
        else:
            x_bio = tfkl.Lambda(lambda x: tf.ones((tf.shape(x)[0], 1, self.d)),
                                name=self.model_name + '_loadings')(abio_resp)

        assoc = self.association(x_bio)
        bio_resp = tfkl.Dot(axes=1)([self.counts, assoc])

        ### Aggregate abiotic and biotic effects
        drivers = [abio_resp, bio_resp]
        # if self.im_config['archi']['fit_bias']=='offset':
        #     drivers+=[tf.expand_dims(tf.constant(offsets,dtype=tf.float32),0)]

        ### Aggregation is done by addition here; it could be extended to more complex differentiable functions
        if (self.dist[0] == 'binomial') and (mode == 'bam'):
            ## Add an intercept (useful for basal species)
            off_bio_resp = BiasLayer(name=self.model_name +
                                     '_indep_offset')(bio_resp)
            pred = tfkl.Activation('sigmoid')(abio_resp) * tfkl.Activation(
                'sigmoid')(off_bio_resp)
            self.eta_model = tfk.Model(self.inputs, [abio_resp, off_bio_resp])
        else:
            logits = tfkl.Add(name=self.model_name + '_out')(drivers)

            if self.dist[0] in ['negbin']:
                self.disp = GlobalParam(self.m, 'disp')
                logits = self.disp(logits)
                self.nbr = NB(theta_var=self.disp.kernel)

            pred = tfkl.Activation(act_fn.get(self.dist[0]))(logits)
            self.eta_model = tfk.Model(self.inputs, logits)

        self.pred_model = tfk.Model(self.inputs, pred)
        self.hsm_model = tfk.Model(self.env_in, abio_resp)

        if self.var_assoc:
            self.assoc_model = tfk.Model(self.bio_in, assoc)
def main():

    #Simple argument consistency check
    #Exit if arguments not specified
    if len(sys.argv) != 3:
        print("usage: python3 nblearn.py TRAININGFILE MODELFILE")
        sys.exit(0)

    #Create an instance of Naive Bayes class in training mode
    naive_bayes = NB('TRAIN')
    #Fit the data with the training file
    naive_bayes.fit(sys.argv[1])
    #Train the classifier
    naive_bayes.train()
    #Output a model file to the file specified
    naive_bayes.generate_model(sys.argv[2])
Example #8
def main():
    train_filename = sys.argv[1]
    # This command cross-validates the given training data set
    if sys.argv[2] == 'cross_validate':
        # cross-validate the given training data set with the kNN algorithm
        if sys.argv[3][1] == 'N':
            k_value = int(sys.argv[3][0])
            knn = KNN()
            print(knn.cross_validation(k_value, train_filename))
        # cross-validate the given training data set with the NB algorithm
        else:
            nb = NB()
            print(nb.cross_validation(train_filename))

    # This command generates 10-fold stratified data from the given training dataset in the required format
    # The required format is described in the assignment documentation
    elif sys.argv[2] == 'write_stratification':
        knn = KNN()
        knn.writeStratification(train_filename, "10_folds_stratified.csv")
    else:
        test_filename = sys.argv[2]
        if sys.argv[3][1] == 'N':
            k_value = int(sys.argv[3][0])
            knn = KNN()
            train_dataset = knn.getData(train_filename)
            test_data = knn.getData(test_filename)
            train_data, label_data = knn.splitAttributeLabels(train_dataset)
            results = knn.knn_predict(train_data, label_data, test_data,
                                      k_value)
            for i in results:
                print(i)
        elif sys.argv[3] == 'NB':
            nb = NB()
            train_data = nb.getTrainData(train_filename)
            test_data = nb.getTestData(test_filename)
            result = nb.getPredictions(test_data, train_data)
            for i in range(len(result)):
                if result[i][-1] == 1:
                    print('yes')
                else:
                    print('no')
Example #9
zr = ZeroR()
zr.train(t, rows)
print("\ndiabetes")
zr.dump()

# NB

print("\n\n#--- Nbok ---------------------")
tbl = Tbl("weathernon.csv")
rows = []
for lst in tbl.fromString(False, "file"):
    rows.append(lst)
c = Col()
tbl.cols = c.colNum(tbl.rows)
nb = NB(tbl, 3)
nb.train(tbl, rows)
print("\nweathernon")
nb.dump()

print()
tbl = Tbl("diabetes.csv")
rows = []
for lst in tbl.fromString(False, "file"):
    rows.append(lst)
c = Col()

tbl.cols = c.colNum(tbl.rows)
nb = NB(tbl, 19)
nb.train(tbl, rows)
print("\ndiabetes")
Example #10
from NB import NB
from TAN import TAN  # assumed sibling module providing the TAN classifier used below
from scipy.io import arff
import pandas as pd
import sys
import random
import numpy as np


# read input
def readInput(file):
    dataset, meta = arff.loadarff(file)
    data = pd.DataFrame(dataset)
    # decode byte-string (nominal) attributes in the dataframe
    for att in meta.names():
        if data[att].dtype == object:
            data[att] = data[att].str.decode('utf-8')
    return data, meta


# define cases
trainFile = sys.argv[1]
testFile = sys.argv[2]
model = sys.argv[3]

trainData, meta = readInput(trainFile)
testData, meta = readInput(testFile)

if model == "n":
    accuracy = NB(trainData, testData, meta)
elif model == "t":
    accuracy = TAN(trainData, testData, meta)
Example #11
    def __init__(self):
        NB.__init__(self)
Example #12
File: DB.py Project: RP7/R7-OCM
	def __init__(self):
		NB.__init__(self)
Example #13
                                 'sci.med', 'sci.space', 'talk.politics.misc'
                             ],
                             columns=[
                                 'comp.graphics', 'rec.sport.hockey',
                                 'sci.med', 'sci.space', 'talk.politics.misc'
                             ])
        plt.figure(figsize=(10, 7))
        ax = seaborn.heatmap(df_cm, annot=True)
        ax.set(xlabel='Ground Truth', ylabel='Predicted')
        plt.title("TF-IDF/" + str(j) + "/" + str(i))
        plt.savefig("plots/TF_IDF_" + str(j) + "_" + str(i) + ".png")

        # plt.show()

        model = NB()
        model.fit(trainx, trainy)
        result = model.predict(testx)
        acr = accuracy(result, testy)

        # append this run's accuracy to the results file
        with open('accuracy.txt', mode='a') as f:
            f.write("NA_NA_" + str(j) + "_" + str(i) + " " + str(acr) + "\n")
        conf = confusion_matrix(testy,
                                result,
                                labels=[
                                    'comp.graphics', 'rec.sport.hockey',
                                    'sci.med', 'sci.space',
                                    'talk.politics.misc'
                                ])
Example #14
File: vinla.py Project: Lirunhua/Jarce
            print('parents of each node (attributes):\n', TAN_data.parents, '\n')

    print('\nAverage result over all cross-validation folds:\n', round(sum(loss01)/len(loss01), 3))

if Global_V.SCHEME.upper() == 'KDB':
    pass
if Global_V.SCHEME.upper() == 'AODE':
    pass
if Global_V.SCHEME.upper() == 'NB':

    result = []
    fold_count = 0
    for CrossTest_count in range(0, data_len, data_len // 10):  # cross-validation
        p_c_fold = []
        loss01_fold = 0
        NB_data = NB(data_initial, 0.9, CrossTest_count)
        NB_data.train()
        p_c_fold, result_fold = NB_data.classify()
        result.append(result_fold)
        # print('\nFold', fold_count, 'cross-validation result:\n')
        print('\nCross-validation fold', fold_count)
        loss01_fold = estimate_Output(NB_data.testData, p_c_fold, result_fold, 1)
        loss01.append(loss01_fold)
        fold_count += 1
        # For each classification run we must keep: 1. the probability p_y that each instance in testData belongs to each class Ci  2. the classification result: result
        # output(result, TAN_car.testData, 1)    # inputs of the output function:
        # output should produce: 1. the probability of each test instance for each class  2. the classification result  3. testData (plus mode parameter 1, 2, or 3; different parameters produce different output)
        # compare the prediction results with the real results
        if Global_V.PRINTPAR == 3:
            print('testset:\n', NB_data.testData, '\n')
            print('p_ci= \n', p_c_fold, '\n', 'result:\n', result)