def test_single_variate_single_dimension(self):
    """Bootstrap a single variable given as a 1-d numpy array.

    Every element of the array counts as one instance.  After running
    the bootstrap, the object must report ``N`` equal to the number of
    elements (5) and ``B`` equal to the requested replication count.
    """
    replications = 2
    statistic = np.mean
    samples = np.array([1, 2, 3, 4, 5])

    boot = Bootstrap.Bootstrap(samples, statistic, replications)
    boot.run()

    assert boot.N == 5
    assert boot.B == replications
def test_07a(self):
    """Bootstrap the ratio of the first eigenvalue to the eigenvalue sum.

    Steps:
      1. Load the empirical data and print its shape, covariance matrix,
         eigenvalues and (transposed) eigenvectors.
      2. Reset ``self.eigenvectors`` to an empty 3-d array, register
         ``self.my_callback`` on the bootstrap and run 200 replications
         of ``self.ratio_first_eigenvector_to_sum``.
      3. Print the returned standard deviation / standard error and plot
         a histogram of ``theta_star`` plus a box-and-whisker chart.

    NOTE(review): ``my_charts.plot_box_and_whisker()`` is called without
    arguments, so ``self.eigenvectors`` is only printed here - confirm
    whether it was meant to be passed to the plot.
    """
    data = my_data.get_data()
    statistic = self.ratio_first_eigenvector_to_sum
    replications = 200

    # explore the empirical data...
    covariance = np.cov(data, bias=True, rowvar=False)
    eigenvalues, eigenvectors = LA.eig(covariance)
    eigenvectors = np.transpose(eigenvectors)
    empirical_report = (
        ("--- empirical data - shape ---", data.shape),
        ("--- empirical data - covariance matrix ---", covariance),
        ("--- empirical data - eigenvalues ---", eigenvalues),
        ("--- empirical data - eigenvectors ---", eigenvectors),
    )
    for heading, value in empirical_report:
        print(heading)
        print(value)

    # prepare to collect data - empty 3-d array
    # (presumably filled by self.my_callback during the run - verify)
    num_attributes = data.shape[1]
    self.eigenvectors = np.empty([0, num_attributes, num_attributes])

    # run the bootstrap
    print("--- run the bootstrap ---")
    boot = Bootstrap.Bootstrap(data, statistic, replications)
    boot.add_callback(self.my_callback)
    std_dev, std_err = boot.run()
    print("standard deviation:")
    print(std_dev)
    print("standard error of the mean")
    print(std_err)

    # investigate the results:
    # plot the theta_stars (the measure) from the bootstrap replications;
    # the expectation is that this is somewhat gaussian
    # (long tails are not acceptable)
    print("--- results from bootstrap ---")
    my_charts.plot_histogram(
        boot.theta_star,
        "Count of Occurrences",
        "Ratio: eigenV1/sum(eigen)",
        "Histogram - Count of EigenV1/sum",
    )

    # plot the first two principal component vectors using
    # box-and-whisker; we are looking for (lack of) variability
    print("first two principal components")
    print(self.eigenvectors.shape)
    my_charts.plot_box_and_whisker()
def test_treatment(self):
    """Print summary statistics for the treatment sample and bootstrap it.

    Runs 100 bootstrap replications of ``self.my_s`` over the treatment
    observations.

    TODO: this is incorrect - the bootstrap should actually be run on
    the DIFFERENCE between treatment and control, not on the treatment
    sample alone.
    """
    treatment = np.array([94, 197, 16, 38, 99, 141, 23])
    statistic = self.my_s
    replications = 100

    # look at the original data...
    print(
        "Treatment sample size: ", treatment.shape[0],
        " mean: ", np.mean(treatment),
        "sem: ", stats.sem(treatment),
    )

    # run the bootstrap (results are unpacked but not checked further)
    boot = Bootstrap.Bootstrap(treatment, statistic, replications)
    std, sem = boot.run()
def test_single_variate(self):
    """Bootstrap a single variable given as a 2-d numpy array.

    The attribute lives in column 0 and each row is one instance.
    After the run, ``N`` must equal the number of rows and ``B`` the
    requested replication count.
    """
    num_instances, num_attributes = 100, 1
    samples = np.random.randint(5, size=(num_instances, num_attributes))
    statistic = np.mean
    replications = 2

    boot = Bootstrap.Bootstrap(samples, statistic, replications)
    boot.run()

    assert boot.N == num_instances
    assert boot.B == replications
def test_multi_variate(self):
    """Bootstrap three variables given as a 2-d numpy array.

    Attributes are the columns and each row is one instance.  After the
    run, ``N`` must equal the number of rows and ``B`` the requested
    replication count.
    """
    num_instances, num_attributes = 6, 3
    samples = np.random.randint(5, size=(num_instances, num_attributes))
    statistic = np.mean
    replications = 2

    boot = Bootstrap.Bootstrap(samples, statistic, replications)
    boot.run()

    assert boot.N == num_instances
    assert boot.B == replications
def generate_bootstraps(self):
    """Create ``self.nTree`` bootstrap samples and return the collection.

    For each tree a fresh ``Bootstrap`` is created, generated from
    ``self.original_dataset`` and appended to ``self.bootstraps``.

    Returns:
        ``self.bootstraps`` (the same list object that was appended to).
    """
    # bootstrapping: one sample per tree
    for _ in range(self.nTree):
        sample = Bootstrap()
        sample.generate(self.original_dataset)
        self.bootstraps.append(sample)
    return self.bootstraps
# --- configuration -------------------------------------------------------
# number of bootstrap samples (YOU CAN PLAY AROUND WITH THIS)
BOOTSTRAP_N = 20
# account for df's named index column 0
# (DON'T CHANGE THIS UNLESS YOUR DATASET NEEDS IT)
DATA_START_INDEX = 1
# switch to do sweep of K values using K means to find optimal K
DO_K_SWEEP = True
# Iris dataset has 3 clusters (ground truth), change this for different
# datasets
OPTIMAL_K = 3
# NOTE(review): BOOTSTRAP_SIZE and MAX_K are used below but are not defined
# in this part of the file - presumably set earlier; confirm.

# --- import data ---------------------------------------------------------
iris = datasets.load_iris()
df = pd.DataFrame(
    data=np.c_[iris['data']],
    columns=iris['feature_names'],
)

# --- prepare data (add index column 'flower') ----------------------------
prep = Prepare('flower', len(df)).names_join(df)
df = prep['df']
labels = prep['labels']

# --- generate bootstrap samples ------------------------------------------
bts = Bootstrap(df, BOOTSTRAP_SIZE, BOOTSTRAP_N).get_bootstraps()

# --- determine optimal clustering K --------------------------------------
kmeans = Bootstrap.kmeans_bootstrap(
    bts, DO_K_SWEEP, BOOTSTRAP_N, DATA_START_INDEX, OPTIMAL_K, MAX_K
)

# max k determined above becomes optimal k
# NOTE(review): the literal 3 passed below presumably duplicates OPTIMAL_K;
# confirm and replace so that changing OPTIMAL_K takes effect everywhere.
gmm = RunAlgos(
    3, BOOTSTRAP_N, DATA_START_INDEX, bts, kmeans
).run_GMM()
agglomerative = RunAlgos(
    3, BOOTSTRAP_N, DATA_START_INDEX, bts, kmeans
).run_Agglomerative()
kmeans_ = RunAlgos(
    3, BOOTSTRAP_N, DATA_START_INDEX, bts, kmeans
).run_KMeans()

# --- consensus clustering ------------------------------------------------
cc_init = Consensus(
    kmeans_, gmm, agglomerative, bts, df, DATA_START_INDEX, 3, labels
)
mats = cc_init.combine_results()
def main(args):
    """Create the application object, publish it as ``app`` and run it.

    The ``Bootstrap`` module is imported lazily inside the function, a
    ``Bootstrap`` instance is built from ``args`` and stored in the
    module-level name ``app``, and its ``exec_()`` method is then called.

    Args:
        args: Passed unchanged to ``Bootstrap.Bootstrap``.
    """
    # The ``global`` declaration must come before any binding of ``app``
    # in this function; declaring it after ``import Bootstrap as app``
    # is a SyntaxError in Python 3.
    global app
    # Import under a separate local alias so the module never occupies
    # the global ``app`` name that is meant to hold the instance.
    import Bootstrap as bootstrap_module

    app = bootstrap_module.Bootstrap(args)
    app.exec_()