def kFoldCrossValidation(k, dataset):
    x = dataset.getx()
    y = dataset.gety()
    dim = dataset.getDimension()
    subDataDim = dim // k
    # shuffle the dataset (copy x and y first so we do not overwrite them while reading)
    randomx = np.copy(x)
    randomy = np.copy(y)
    randomList = random.sample(range(0, dim), dim)
    for i in range(0, dim):
        randomx[i] = x[randomList[i]]
        randomy[i] = y[randomList[i]]
    x = randomx
    y = randomy
    for i in range(0, k):
        if i == k - 1:
            # last fold: the test set takes all remaining examples
            xtest = x[i * subDataDim:]
            ytest = y[i * subDataDim:]
            xtrain = x[0:i * subDataDim]
            ytrain = y[0:i * subDataDim]
        else:
            xtest = x[i * subDataDim:i * subDataDim + subDataDim]
            ytest = y[i * subDataDim:i * subDataDim + subDataDim]
            if i == 0:
                xtrain = x[i * subDataDim + subDataDim:]
                ytrain = y[i * subDataDim + subDataDim:]
            else:
                xtrain = x[:i * subDataDim]
                xtrain = np.concatenate((xtrain, x[i * subDataDim + subDataDim:]))
                ytrain = y[:i * subDataDim]
                ytrain = np.append(ytrain, y[i * subDataDim + subDataDim:])
        trainingSet = d.DataSet(xtrain, ytrain, len(ytrain), dataset.getFeaturesNumber())
        testSet = d.DataSet(xtest, ytest, len(ytest), dataset.getFeaturesNumber())
        test(trainingSet, testSet)
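# A minimal sketch (not part of the original code) of the same shuffled k-fold split
# using scikit-learn's KFold. It assumes x and y are numpy arrays and reuses the
# d.DataSet and test helpers from the snippet above, which are project-specific.
from sklearn.model_selection import KFold

def kFoldCrossValidationSklearn(k, dataset):
    x, y = dataset.getx(), dataset.gety()
    kf = KFold(n_splits=k, shuffle=True)
    for train_idx, test_idx in kf.split(x):
        trainingSet = d.DataSet(x[train_idx], y[train_idx], len(train_idx), dataset.getFeaturesNumber())
        testSet = d.DataSet(x[test_idx], y[test_idx], len(test_idx), dataset.getFeaturesNumber())
        test(trainingSet, testSet)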
def main():
    fold = 10
    data_set = dataFile('wine/wine.data', 0).Open()
    cv_data = CV.CrossValidation(data_set, fold)
    acc = []
    res = 0
    for run in range(5):  # repeat the whole cross-validation five times (renamed from i to avoid shadowing the fold index)
        for i in range(0, fold):
            training_data, test_data = cv_data.findTest(i)
            training_data = DataSet(training_data)
            test_data = DataSet(test_data)
            size_of_test_data = test_data.getNumberOfRow()
            X_train, X_test = training_data.getData(), test_data.getData()
            y_train, y_test = training_data.getLabels(), test_data.getLabels()
            classifier = RandomForestClassifier(n_estimators=10, criterion='entropy')
            classifier.fit(X_train, y_train)
            y_predict = classifier.predict(X_test)
            accuracy = accuracy_score(y_test, y_predict, normalize=False) / size_of_test_data
            acc.append(accuracy)
        print(1 - cv_data.Accuracy(acc))
        res += 1 - cv_data.Accuracy(acc)
    print(res / 5)
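# A hedged sketch (an assumption, not the original author's pipeline) of the same
# experiment using scikit-learn's built-in cross-validation. It assumes X and y hold
# the full wine feature matrix and labels as numpy arrays.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def run_cv_with_sklearn(X, y, fold=10):
    clf = RandomForestClassifier(n_estimators=10, criterion='entropy')
    scores = cross_val_score(clf, X, y, cv=fold)  # accuracy per fold
    return 1 - scores.mean()                      # mean error rate, as printed above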
def holdoutCrossValidation(dataset, scale=True):
    x = dataset.getx()
    y = dataset.gety()
    dim = dataset.getDimension()
    # 70% train - 30% test
    traindim = dim * 70 // 100  # multiply before the integer division so this really is 70% of the examples
    # create a random list of indices that decide which examples go into the training set
    randomList = random.sample(range(0, dim), traindim)
    # sort the list in reverse so values can be deleted from the original dataset without shifting the remaining indices
    randomList.sort(reverse=True)
    xtrain = np.empty(shape=(traindim, dataset.getFeaturesNumber()))
    ytrain = np.empty(traindim)
    # move the randomly selected values from x into xtrain, and do the same for y
    for i in range(0, traindim):
        xtrain[i] = x[randomList[i]]
        x = np.delete(x, randomList[i], 0)
        ytrain[i] = y[randomList[i]]
        y = np.delete(y, randomList[i])
    xtest = x
    ytest = y
    trainingSet = d.DataSet(xtrain, ytrain, traindim, dataset.getFeaturesNumber())
    testSet = d.DataSet(xtest, ytest, dim - traindim, dataset.getFeaturesNumber())
    print("NOT MINMAXSCALED")
    test(trainingSet, testSet)
    if scale:
        print("MINMAXSCALED")
        trainingSet.minmaxScale()
        testSet.minmaxScale()
        test(trainingSet, testSet)
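# A minimal, hypothetical alternative (not the original implementation) that does the
# same 70/30 holdout split with scikit-learn. It assumes x and y are numpy arrays;
# unlike the snippet above, the scaler here is fitted on the training data only.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

def holdout_split_sklearn(x, y, scale=True):
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.30)
    if scale:
        scaler = MinMaxScaler().fit(xtrain)  # fit on the training split only
        xtrain, xtest = scaler.transform(xtrain), scaler.transform(xtest)
    return xtrain, xtest, ytrain, ytest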
def test_log_reg(type='batch'):
    trn = ds.DataSet("./Data/usps_train.csv", set_type='log_reg', delim=',', y_col=256)
    tst = ds.DataSet("./Data/usps_test.csv", set_type='log_reg', delim=',', y_col=256)
    if type == 'batch':
        w = mla.get_data_mwlogb(trn.data_mx, trn.data_my)
    elif type == 'online':
        w = mla.get_data_mwlogo(trn.data_mx, trn.data_my)
    trn_g = mla.get_data_mglog(w, trn.data_mx)
    trn_sse = ds.get_sse(trn_g, trn.data_my)
    trn_ase = ds.get_ase(trn_g, trn.data_my)
    tst_g = mla.get_data_mglog(w, tst.data_mx)
    tst_sse = ds.get_sse(tst_g, tst.data_my)
    tst_ase = ds.get_ase(tst_g, tst.data_my)
    #print "W: ", w
    #print "Training G: ", trn_g
    #print "Testing G: ", tst_g
    print "Logistic Regression: "
    print "Training SSE: ", trn_sse
    print "Testing SSE: ", tst_sse
    print "Training ASE: ", trn_ase
    print "Testing ASE: ", tst_ase, "\n"
def testDaily2Weekly(self):
    date = []
    open = []
    high = []
    low = []
    close = []
    for i in xrange(20021202, 20021205):
        date.append(i)
        open.append(10.0)
        high.append(10.2)
        low.append(9.8)
        close.append(9.9)
    self.assertEquals(open[0], 10.0)
    dailyDataSet = DataSet.DataSet()
    dailyDataSet.register('date', date)
    dailyDataSet.register('open', open)
    self.assertEquals(dailyDataSet.get('open')[0], 10.0)
    dailyDataSet.register('high', high)
    dailyDataSet.register('low', low)
    dailyDataSet.register('close', close)
    newds = DataSet.DataSet()
    dates_from_daily_to_weekly(dailyDataSet, newds)
    self.assertEquals('date' in newds.get_headers(), 1)
    translatedDates = newds.get('date')
    self.assertEquals(len(translatedDates), 1)
    self.assertEquals(translatedDates[0], 20021202)
    self.assertEquals(dailyDataSet.get('open')[0], 10.0)
    self.assertEquals(dailyDataSet.get('close')[0], 9.9)
def test_per(type='batch'):
    trn = ds.DataSet("./Data/usps_train.csv", set_type='per', delim=',', y_col=256)
    tst = ds.DataSet("./Data/usps_test.csv", set_type='per', delim=',', y_col=256)
    if type == 'batch':
        w = mla.get_data_mwpb(trn.data_mx, trn.data_my)
    elif type == 'online':
        w = mla.get_data_mwpo(trn.data_mx, trn.data_my)
    elif type == 'voted':
        w = mla.get_data_mwpv(trn.data_mx, trn.data_my)
    trn_g = mla.get_data_mgp(w, trn.data_mx)
    trn_sse = ds.get_sse(trn_g, trn.data_my)
    trn_ase = ds.get_ase(trn_g, trn.data_my)
    tst_g = mla.get_data_mgp(w, tst.data_mx)
    tst_sse = ds.get_sse(tst_g, tst.data_my)
    tst_ase = ds.get_ase(tst_g, tst.data_my)
    # print "WS: ", np.shape(w)
    # print "XS: ", np.shape(trn.data_mx)
    # print "YS: ", np.shape(trn.data_my)
    print type.upper(), " Perceptron: "
    print "Training Mistakes: ", trn_sse
    print "Testing Mistakes: ", tst_sse
    print "Training Mistake Percent: ", trn_ase
    print "Testing Mistake Percent: ", tst_ase, "\n"
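# A hedged sketch (an assumption, not the project's mla module) of the same kind of
# evaluation using scikit-learn's Perceptron; the trn/tst attribute names simply
# follow the DataSet usage in the snippet above.
import numpy as np
from sklearn.linear_model import Perceptron

def test_per_sklearn(trn, tst):
    clf = Perceptron()
    clf.fit(trn.data_mx, np.ravel(trn.data_my))
    # score() returns mean accuracy, so 1 - score is the mistake fraction reported above
    return 1 - clf.score(tst.data_mx, np.ravel(tst.data_my))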
def read_img_sets(image_dir, image_size, validation_size=0):
    class DataSets:
        pass
    data_sets = DataSets()

    images, labels, ids, cls, cls_map = load_data(image_dir, image_size)

    # a float validation_size is interpreted as a fraction of the whole dataset
    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    test_images = images[:validation_size]
    test_labels = labels[:validation_size]
    test_ids = ids[:validation_size]
    test_cls = cls[:validation_size]

    train_images = images[validation_size:]
    train_labels = labels[validation_size:]
    train_ids = ids[validation_size:]
    train_cls = cls[validation_size:]

    data_sets.train = DataSet.DataSet(train_images, train_labels, train_ids, train_cls)
    data_sets.test = DataSet.DataSet(test_images, test_labels, test_ids, test_cls)

    return data_sets, cls_map
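# A hypothetical usage sketch: the directory, image size and validation fraction are
# assumptions, not values taken from the original project.
if __name__ == '__main__':
    data_sets, cls_map = read_img_sets('data/images', image_size=64, validation_size=0.2)
    print(cls_map)  # the class map returned by load_data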
def make_dates_and_dataset(self, dates):
    date = []
    for x in dates:
        date.append(x)
    dailyDataSet = DataSet.DataSet()
    dailyDataSet.register('date', date)
    newds = DataSet.DataSet()
    dates_from_daily_to_weekly(dailyDataSet, newds)
    return dates, dailyDataSet, newds
def test_knn():
    trn = ds.DataSet("./Data/knn_train.csv", set_type='knn', delim=',', y_col=0)
    tst = ds.DataSet("./Data/knn_test.csv", set_type='knn', delim=',', y_col=0)
    trn_ks = mla.get_best_k_cv(trn.data_mx, trn.data_my, 51)
    # note: this repeats the call on the training data, so tst_ks duplicates trn_ks and is never used
    tst_ks = mla.get_best_k_cv(trn.data_mx, trn.data_my, 51)
    print trn_ks
def divideDataSet(file, attrnames, target, values):
    # the dataset is divided into train (90%) and test (10%) and the two dataset objects are created
    trainsize = (file_len(file) * 90) // 100  # 90% train, 10% test
    allExamples = list()
    for line in open(file).readlines():
        content = line.split(',')
        if file in datasetWithID:  # only these datasets have the id attribute
            content.pop(0)  # remove the first element (the id) so it is not used in the tests
        content = [c.rstrip() for c in content]
        allExamples.append(content)
    train = random.sample(allExamples, trainsize)  # take 90% of the examples in the dataset for the training set
    dsetC = list(allExamples)  # copy the list so the original is not modified
    trainC = list(train)       # copy the list so the original is not modified
    inCommon = [val for val in dsetC if val in trainC]  # find common elements
    for i in range(len(inCommon)):  # remove the common elements from both copies
        dsetC.remove(inCommon[i])
        if inCommon[i] in trainC:
            trainC.remove(inCommon[i])
    test = dsetC + trainC  # join to get exactly the remaining 10% of the examples for the test set

    # create the dataset structure for the training examples
    examples = []
    for i in range(len(train)):
        example = []
        for j in range(len(train[0])):
            example.append(train[i][j])
        examples.append(example)
    attributes = []
    for i in range(0, len(example)):
        attributes.append(i)
    inputs = removeTarget(attributes, target)
    DataTrain = DataSet.DataSet(examples, inputs, attributes, target, attrnames, values)  # creates the dataset for use

    # create the dataset structure for the test examples
    examples = []
    for i in range(len(test)):
        example = []
        for j in range(len(test[0])):
            example.append(test[i][j])
        examples.append(example)
    attributes = []
    for i in range(0, len(example)):
        attributes.append(i)
    inputs = removeTarget(attributes, target)
    DataTest = DataSet.DataSet(examples, inputs, attributes, target, attrnames, values)  # creates the dataset for use
    return DataTest, DataTrain
def __init__(self, name, start_bar=-1, numbars=-1, dataset=None, filename=None):
    self.filename = filename
    self.subchart = {}            # key is the title of the subchart; value is the subchart object
    self.subchart_order = []      # list of subchart titles in order of display
    self.subchart_coords = [{}]   # store y and height values
    self.start_bar = start_bar
    self.numbars = numbars
    self.name = name
    self.drawtrendline = 0
    # assume daily data
    if dataset is None:
        self.daily_dataset = DataSet.DataSet()
    else:
        self.daily_dataset = dataset
    self.weekly_dataset = DataSet.DataSet()
    self.monthly_dataset = DataSet.DataSet()
    self.vertical_border = 15
    self.horizontal_border = 5
    self.yscale_space = 40   # used in labeling the y axis
    self.xscale_space = 30   # used in labeling the x axis
    self.xscale = -1         # xscale is calculated once at the chart level
    self.prev_values = {}    # used by the draw method to only draw when needed
    self.prev_values['x'] = -1
    self.prev_values['y'] = -1
    self.prev_values['width'] = -1
    self.prev_values['height'] = -1
    self.prev_values['numbars'] = -1
    self.prev_values['start_bar'] = -1
    self.scale = SCALE_DAILY  # default
    self.able_to_draw = 0     # disable drawing; used when the chart is first created
    self.x = -1
    self.y = -1
    self.width = -1
    self.height = -1
    self.calc_next_draw = 1
    # ensure that all datasets have a date series
    TranslateDate.dates_from_daily_to_weekly(self.get_daily_dataset(), self.get_weekly_dataset())
    TranslateDate.dates_from_daily_to_monthly(self.get_daily_dataset(), self.get_monthly_dataset())
def getImageNumber(self, targetLabel):
    if self.data_set == 'mnist':
        datasetTest = DataSet('mnist', 'test')
    if self.data_set == 'cifar10':
        datasetTest = DataSet('cifar10', 'test')
    test_x, test_y = datasetTest.get_dataset()
    myList = []
    # collect the indices of every test image whose label matches targetLabel
    for i in range(0, 10000):
        label = numpy.where(test_y[i] > 0)[0][0]
        strLabel = self.get_label(int(label))
        if str(targetLabel) == str(strLabel):
            myList.append(i)
    print(myList)
    raw_input()
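# A small aside (a sketch, not original code): for one-hot encoded label vectors,
# numpy.argmax gives the same class index as the numpy.where(...)[0][0] pattern above.
import numpy
one_hot = numpy.array([0, 0, 1, 0])
assert numpy.argmax(one_hot) == numpy.where(one_hot > 0)[0][0] == 2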
def select_tuple(tuples, k):
    r"""Select the top-k confidence tuples into dataset_list.

    Args:
        tuples (list): A list of all instances of the Tuple class, which contains
            every tuple in the whole csv file.
        k (int): The number of top-confidence tuples to select.

    Return:
        dataset_list (list of DataSet): A list of DataSet instances, which will be
            used as the training set.
    """
    for i in range(k):
        print(tuples[i].cid, tuples[i].value_dict)
    # true_str = input("Which tuples violate the CFDs you want to express? "
    #                  "(Please input the cid of tuples, e.g. 2,3,4,5) >>> ")
    # false_str = input("Which tuples don't violate the CFDs you want to express? "
    #                   "(Please input the cid of tuples, e.g. 2,3,4,5) >>> ")
    true_str = '0,1,7'
    false_str = '2,3,4,5,6'
    mark_label(true_str, false_str, tuples)
    dataset_list = list()
    for i in range(k):
        dataset = DataSet()
        dataset.cid = tuples[i].cid
        dataset.feature_vec = tuples[i].feature_vec
        dataset.label = tuples[i].label
        dataset_list.append(dataset)
    return dataset_list
def add_source(self):
    filename = tkFileDialog.askopenfilename()
    if filename:
        dataset = DataSet()
        dataset.readFromFile(filename)
        self.datasets.append(dataset)
        self.sourcelist.insert(END, str(dataset))
def __init__(self, DataFileName, IndependentVariablesList, DependentVariablesList):
    self.Data = DataSet(DataFileName)
    self.TrainingData = self.Data.TrainingData()
    self.TestingData = self.Data.TestingData()
    self.DependentVariablesList = DependentVariablesList
    self.IndependentVariablesList = IndependentVariablesList
    self.CalculateAICs()
class OLSUnitTests(unittest.TestCase):
    Data = DataSet('TestData.csv')
    OLSTest = OLSRegression('TestData.csv', 'y', ['x', 'x2'])

    def test_FitSize(self):
        NewOLS = OLSRegression('TestData.csv', 'x', ['x'])
        Actual = len(NewOLS.ModelFit())
        Expected = len(NewOLS.Data.TestingData()['x'])
        self.assertEqual(Actual, Expected)

    def test_DataSetWithParameters(self):
        TrainingDataLength = len(self.Data.TrainingData())
        Expected = numpy.round(len(self.Data.AllData().index) * 0.8)
        self.assertEqual(TrainingDataLength, Expected)

    def test_TestingSet(self):
        Actual = len(self.Data.TestingData())
        Expected = numpy.round(len(self.Data.AllData().index) * 0.2)
        self.assertEqual(Actual, Expected)

    def test_SetDependentVariableList(self):
        Actual = self.OLSTest.DependentVariablesList
        Expected = ['x', 'x2']
        self.assertEqual(Actual, Expected)

    def test_RegressionCoefficient(self):
        NewOLS = OLSRegression('TestData.csv', 'y', ['x'])
        Actual = numpy.round(NewOLS.Regression.coef_)
        self.assertEqual(Actual, 2)

    def test_AIC(self):
        NewOLS = OLSRegression('TestData.csv', 'x', ['x'])
        Actual = numpy.round(NewOLS.AIC())
        self.assertEqual(Actual, -94)
def shuffleDataTrain(dataset):
    # given the training set, the order of the examples is randomized and a subset is taken
    l = len(dataset.examples)
    # sample() takes n random examples from the dataset; with n = number of examples it simulates a full randomization
    sds = random.sample(dataset.examples, l)
    # from half of the examples, drop a random subset so the resulting trees differ a bit from one another
    for i in range(0, l // 2):
        a = random.randint(0, 4)
        if a == 1:
            sds.pop(i)
    # create examples, attributes and inputs as before...
    examples = []
    for i in range(len(sds)):
        example = []
        for j in range(len(sds[0])):
            example.append(sds[i][j])
        examples.append(example)
    attributes = []
    for i in range(0, len(example)):
        attributes.append(i)
    inputs = removeTarget(attributes, dataset.target)
    return DataSet.DataSet(examples, inputs, attributes, dataset.target,
                           dataset.attrnames, dataset.values)  # creates the dataset for use
def __InterpretTxtFile(self, filePath, hasFeatureLine, seperator, deleteBadData):
    try:
        fptr = open(filePath, 'r')
    except Exception as e:
        Errors.ShowWarningMsgBox(self, str(e))
    lines = fptr.readlines()
    lineCount = len(lines)
    newLines = list()
    for i in range(lineCount):
        newLineArray = lines[i].split(" ")
        if not (seperator == ""):
            newLineArray = lines[i].split(seperator)
        if deleteBadData:
            # skip lines that contain blank or empty fields
            if newLineArray.count(" ") != 0 or newLineArray.count("") != 0:
                continue
        newLine = ",".join(newLineArray)
        newLines.append(newLine)
    # strip trailing newlines from the joined lines
    for tmpLine in newLines:
        if tmpLine.count("\n") > 0:
            tmp = tmpLine.replace("\n", "")
            index = newLines.index(tmpLine)
            newLines[index] = tmp
    ourDataSet = DataSet.DataSet(newLines, hasFeatureLine)
    fptr.close()
    return ourDataSet
def openDataSet(filename, v):
    data = DS.DataSet("./Data/" + filename)
    # number of input units, one for each element in an input entry
    n = data.getNumInputElem()
    inputs = data.getInputs(test=v)
    targets = data.getTargets()
    return n, inputs, targets
def _readOrLoadDataset(self, ds_type, reference_dataset=None):
    fname = "cache/%s_data.pickle" % ds_type
    try:
        if not params.USE_DATA_CACHE:
            raise Exception("Do not use cache")
        f = open(fname, 'rb')
        ds = pickle.load(f)
        if params.DEBUG:
            print "Using cached %s..." % fname
    except:
        if params.DEBUG:
            print "Reading and dumping %s..." % fname
        data_fname = "data/%s.csv" % ds_type
        ds = DataSet.DataSet(ds_type == 'train')
        ds.importData(data_fname)
        if reference_dataset is not None:
            ds.dropUselessFeatures(reference_dataset.getUselessFeatures())
            ds.addNanFeatures(reference_dataset.getNanColumns())
        if params.LOG_TRANSFORM:
            ds.logTransformQuantitativeFeatures()
        if params.STANDARDIZE_DATA:
            ds.standardizeQuantitativeFeatures(
                means=(reference_dataset.getQuantitativeFeatureMeans()
                       if reference_dataset is not None else None),
                variances=(reference_dataset.getQuantitativeFeatureVariances()
                           if reference_dataset is not None else None))
        pickle.dump(ds, open(fname, 'wb'))
    return ds
def kFoldCrossValidation(k, dataset):
    perceptronAccuracies = []
    votedPerceptronAccuracies = []
    x = dataset.x
    y = dataset.y
    size = dataset.size
    subDataSize = size / k
    # shuffle dataset
    randomx = copy.deepcopy(x)
    randomy = copy.deepcopy(y)
    randomList = random.sample(xrange(size), size)
    for i in xrange(0, size):
        randomx[i] = x[randomList[i]]
        randomy[i] = y[randomList[i]]
    x = randomx
    y = randomy
    for i in xrange(0, k):
        if i == k - 1:
            xtest = x[i * subDataSize:]
            ytest = y[i * subDataSize:]
            xtrain = x[0:i * subDataSize]
            ytrain = y[0:i * subDataSize]
        else:
            xtest = x[i * subDataSize:i * subDataSize + subDataSize]
            ytest = y[i * subDataSize:i * subDataSize + subDataSize]
            if i == 0:
                xtrain = x[i * subDataSize + subDataSize:]
                ytrain = y[i * subDataSize + subDataSize:]
            else:
                xtrain = x[:i * subDataSize]
                xtrain = np.concatenate((xtrain, x[i * subDataSize + subDataSize:]))
                ytrain = y[:i * subDataSize]
                ytrain = np.append(ytrain, y[i * subDataSize + subDataSize:])
        trainingSet = ds.DataSet(xtrain, ytrain, len(ytrain), dataset.numAttributes)
        testSet = ds.DataSet(xtest, ytest, len(ytest), dataset.numAttributes)
        perceptronAccuracy, votedPerceptronAccuracy = test(trainingSet, testSet)
        perceptronAccuracies.append(perceptronAccuracy)
        votedPerceptronAccuracies.append(votedPerceptronAccuracy)
    avgPerceptronAccuracy = round(sum(perceptronAccuracies) / k, 2)
    avgVotedPerceptronAccuracy = round(sum(votedPerceptronAccuracies) / k, 2)
    print("")
    print("Perceptron average accuracy: {}%.".format(avgPerceptronAccuracy))
    print("Voted perceptron average accuracy: {}%.".format(avgVotedPerceptronAccuracy))
    print("")
    print("")
def load_patterns():
    # load pattern data
    dataSet = ds.DataSet(
        '/home/adriano/Projects/ANNDispersionRelation/ann_training/2d/square/te/tests_new_db/16_interpolated_points/'
    )
    dataSet.read_csv_file('dr_te_pc_dataset.csv')
    #print(len(dataSet.all_patterns[192:,:]))
    return dataSet
def load_patterns():
    # load pattern data
    dataSet = ds.DataSet(
        '/home/adriano/Projects/ANNDispersionRelation/ann_training/3d/fcc/diamond2/no_material/16_interpolated_points/'
    )
    dataSet.read_csv_file('dr_diamond_pc_dataset.csv')
    #print(len(dataSet.all_patterns[192:,:]))
    return dataSet
def trainHMM(self, filename):
    print "Reading training data from %s" % (filename)
    # Read in the training data from the file
    dataset = DataSet(filename)
    states, obs = dataset.read_file()
    # Instantiate and train the HMM
    self.hmm, ll = train_model(dataset, 1e-5)
def load_chart(file_name):
    d = load_file(file_name)
    if d is not None:
        ds = DataSet()
        for x in d.keys():
            ds.register(x, d[x])
        name = os.path.basename(file_name)
        chart = create_standard_chart(name, ds)
        chart.set_file_name(file_name)
        chart.can_draw(1)
        current_chart.add(name, chart)
def setup_loaded_chart(d, file_name):
    ds = DataSet()
    for x in d.keys():
        ds.register(x, d[x])
    name = os.path.basename(file_name)
    chart = create_default_chart(name, ds)
    chart.set_file_name(file_name)
    current_chart.add(name, chart)
    current_chart.set_current(name)
    chart.can_draw(1)
    schedule_redraw()
def testing(fileDataset, number):
    length = len(fileDataset.examples) // 10
    k = number * length
    validation = []
    i = 0
    while i < length:
        validation.append(fileDataset.examples[i + k])
        i = i + 1
    return DataSet.DataSet(validation, fileDataset.inputs, fileDataset.attributes,
                           fileDataset.target, fileDataset.attrnames, fileDataset.values)
def estMaxSequence(self, filename):
    print("Reading testing data from %s" % (filename))
    # Read in the testing data from the file
    self.dataset = DataSet(filename)
    self.dataset.readFile(200, "test")
    # Run Viterbi to estimate the most likely sequence
    viterbi = Viterbi(self.hmm)
    self.maxSequence = viterbi.mostLikelySequence(self.dataset.testOutput)
def train_network_QNN(self):
    # Train a model on the selected dataset (mnist, cifar10 or fashion).
    if self.data_set == 'mnist':
        datasetTrain = DataSet('mnist', 'training')
        datasetTest = DataSet('mnist', 'test')
    if self.data_set == 'cifar10':
        datasetTrain = DataSet('cifar10', 'training')
        datasetTest = DataSet('cifar10', 'test')
    if self.data_set == 'fashion':
        datasetTrain = DataSet('fashion', 'training')
        datasetTest = DataSet('fashion', 'test')
    train_x, train_y = datasetTrain.get_dataset()
    test_x, test_y = datasetTest.get_dataset()
    needToTrain, myModel = func.getModelFromQNN(self.cf, train_x, train_y, test_x, test_y)
    # myModel = func.getModelFromDeepGame(cf, train_x, train_y, test_x, test_y, epochs, batch_size)
    self.model = myModel
    score = (self.model).evaluate(test_x, test_y, verbose=0)
    print("Precision " + str(self.abits) + " " + str(self.wbits) + " Test loss:", score[0])
    print("Precision " + str(self.abits) + " " + str(self.wbits) + " Test accuracy:", score[1])
def estimate(self):
    if self.twords > 0:
        da = DataSet()
        da.read_wordmap2(self.dir + self.wordmapfile, self.id2word)
    print("Sampling ", self.niters, " iterations!\n")
    # create the TPTM (default parameters are set here; wrap them into an outer config later)
    self.tp = TPTM(10, self.K, 100, 10000, self.nw, self.ut)
    self.tp.preprocessing()
    self.alpha_c = self.tp.Get_alpha_c()
    self.liter = self.tp.iteration
    last_iter = self.liter
    for self.liter in range(last_iter + 1, self.niters + last_iter):
        print("Iteration ", self.liter, " ...\n")
        if self.liter != 0 and (self.liter % 2) != 0:
            # odd iterations: update the lambda values
            self.tp.update_lambda_s(self.liter)
            self.alpha_c = self.tp.Get_alpha_c()
        elif self.liter != 0 and (self.liter % 2) == 0:
            # even iterations: update the x_u_c_t values
            self.tp.update_x_u_c_t(self.liter)
            self.alpha_c = self.tp.Get_alpha_c()
        elif self.liter != 0 and self.liter % 50 == 0:
            # note: every multiple of 50 is even, so the branch above already catches it
            # and, as written, this branch is never reached
            self.tp.update_Mpre_c(self.liter)
            self.tp.update_Mpre_s(self.liter)
            self.alpha_c = self.tp.Get_alpha_c()
        # for all z_i
        for m in range(self.M):
            for n in range(self.ptrndata.docs[m].length):
                # (z_i) = z[m][n]
                # sample from p(z_i | z_-i, w)
                topic = self.sampling(m, n)
                self.z[m][n] = topic
        if self.savestep > 0:
            if self.liter % self.savestep == 0:
                # saving the model
                print("Saving the model at iteration ", self.liter, " ...\n")
                self.compute_theta()
                self.compute_phi()
                u = Utils()
                self.save_model(u.generate_model_name(self.liter))
    print("Gibbs sampling completed!\n")
    print("Saving the final model!\n")
    self.compute_theta()
    self.compute_phi()
    self.liter -= 1
    u = Utils()
    self.save_model(u.generate_model_name(-1))