def knn(x_train, target_train, x_test, k):
    '''
    k-NN classifier.

    Inputs:
        x_train (n x m): set of n training samples, each with m attributes,
            used to train the model.
        target_train (n x 1): classes of x_train.
        x_test (n2 x m): set of n2 samples, each with m attributes, to be
            classified with the trained model.
        k: number of nearest neighbours to use.

    Returns:
        yhat (n2 x 1): classification of x_test.
    '''
    import numpy as np
    import utils

    cls = []
    n = x_test.shape[0]
    m = x_test.shape[1]
    for i in range(0, n):
        # Euclidean distance from the i-th test sample to every training sample
        distance = np.sqrt(np.sum(np.power(x_train - x_test[i, :], 2), 1))
        ind = np.argsort(distance)
        # Predicted class is the mode of the k nearest neighbours' labels
        cls.append(utils.mode(target_train[ind[0:k]]))
    return np.reshape(np.array(cls), (-1, 1))
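# NOTE (assumption): the `utils.mode` helper used throughout these snippets is
# not included here. A minimal NumPy-based sketch consistent with how it is
# called (return the most frequent value of a flattened array) might look like
# this; the actual course/project implementation may differ.
import numpy as np

def mode(y):
    """Return the most frequent value in y (ties broken by the smallest value)."""
    values, counts = np.unique(np.asarray(y).ravel(), return_counts=True)
    return values[np.argmax(counts)]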
def PluralityLearner(dataset):
    """A very dumb algorithm: always pick the result that was most popular
    in the training data. Makes a baseline for comparison."""
    most_popular = mode([e[dataset.target] for e in dataset.examples])

    def predict(example):
        """Always return same result: the most popular from the training set."""
        return most_popular
    return predict
def extract_tfdf_sites(genome, tf):
    # Select binding-site sequences for this genome/TF pair
    sites = tfdf[(tfdf['genome_accession'] == genome)
                 & (tfdf['TF'] == tf)]['site_sequence']
    # Convert to list (matrix) form, dropping NaN entries (non-string values)
    bio_motif_all_lens = [site for site in sites if type(site) is str]
    modal_length = mode(map(len, bio_motif_all_lens))
    # Keep only sites of the modal length so the motif forms a proper matrix
    bio_motif = filter(lambda x: len(x) == modal_length, bio_motif_all_lens)
    if len(bio_motif) != len(bio_motif_all_lens):
        print "removed", len(bio_motif_all_lens) - len(bio_motif), "of", len(bio_motif_all_lens), "sites"
    return bio_motif
def calculate_accuracy(self, y):
    acc = 0
    for i in range(0, self.nc):
        # Predict the modal class of cluster i for all of its members
        c = utils.mode(y[self.z[i]])
        yhat = mat.repmat(c, len(self.z[i]), 1)
        acc = acc + len(self.z[i]) * utils.accuracy(y[self.z[i]], yhat)
    # Weighted average of the per-cluster accuracies
    acc = acc / self.n
    return acc
def calculate_accuracy(self, y):
    acc = 0
    for i in range(0, self.k):
        ind = np.where(self.c == i)[0]
        # Predict the modal class of cluster i for all of its members
        c = utils.mode(y[ind])
        yhat = mat.repmat(c, ind.size, 1)
        acc = acc + ind.size * utils.accuracy(y[ind], yhat)
    # Weighted average of the per-cluster accuracies
    acc = acc / self.n
    return acc
def i3conf_windowing():
    return lines(
        "# Windowing",
        bindsym("Mod1+o", "split h"),
        bindsym("Mod1+v", "split v"),
        "floating_modifier Mod1",
        bindsym("Mod1+t", "floating toggle"),
        bindsym("Mod1+f", "fullscreen toggle"),
        bindsym("Mod1+s", "layout stacking"),
        bindsym("Mod1+w", "layout tabbed"),
        bindsym("Mod1+e", "layout toggle split"),
        bindsym("Mod1+p", "focus parent"),
        bindsym("Mod1+c", "focus child"),
        mode("resize",
             bindsym("h", "resize shrink width 10 px or 10 ppt"),
             bindsym("j", "resize grow height 10 px or 10 ppt"),
             bindsym("k", "resize shrink height 10 px or 10 ppt"),
             bindsym("l", "resize grow width 10 px or 10 ppt"),
             bindsym_mode("Return", "default"),
             bindsym_mode("Escape", "default"),
             ""),
        bindsym_mode("Mod1+r", "resize"),
        "")
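# NOTE (assumption): the helpers used by i3conf_windowing (`lines`, `bindsym`,
# `mode`, `bindsym_mode`) are not defined in this excerpt. The sketches below
# emit plausible i3 config text but are illustrative stand-ins, not the
# original implementations. Here `mode` builds an i3 "mode" block, unlike the
# statistical `mode` helper used by the other snippets.
def lines(*parts):
    """Join config fragments with newlines."""
    return "\n".join(parts)

def bindsym(key, command):
    """Emit an i3 'bindsym' line."""
    return "bindsym %s %s" % (key, command)

def bindsym_mode(key, mode_name):
    """Emit a 'bindsym' line that switches to the given mode."""
    return 'bindsym %s mode "%s"' % (key, mode_name)

def mode(name, *body):
    """Emit an i3 'mode' block wrapping the given lines."""
    return 'mode "%s" {\n%s\n}' % (name, lines(*body))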
"4.1", "4.2", "5" ]) io_args = parser.parse_args() question = io_args.question if question == "1.1": input_file = "../data/fluTrends.csv" df = pd.read_csv(input_file, header=0) print('Minimum: %.3f' % df.values.min()) print('Maximum: %.3f' % df.values.max()) print('Mean: %.3f' % df.values.mean()) print('Median: %.3f' % np.median(df.values)) import utils print('Mode: %.3f' % utils.mode(df.values)) print('5th percentile: %.3f' % np.percentile(df.values, 5)) print('25th percentile: %.3f' % np.percentile(df.values, 25)) print('50th percentile: %.3f' % np.percentile(df.values, 50)) print('75th percentile: %.3f' % np.percentile(df.values, 75)) print('95th percentile: %.3f' % np.percentile(df.values, 95)) means = df.mean() print('Highest mean is in: %s' % means.idxmax()) print('Lowest mean is in: %s' % means.idxmin()) variances = df.var() print('Highest variance is in: %s' % variances.idxmax()) print('Lowest variance is in: %s' % variances.idxmin()) pass
if question == "1.1":
    # Load the fluTrends dataset
    df = pd.read_csv(os.path.join('..', 'data', 'fluTrends.csv'))
    X = df.values
    names = df.columns.values
    # print(X)
    # print(names)

    ''' YOUR CODE HERE FOR Q1.1 '''
    # 1. Summary statistics across the whole dataset
    a = np.zeros(2)  # unused
    x = np.array(X)
    print("The mean of all values in the dataset is: %.4f" % np.mean(x))
    print("The maximum of all values in the dataset is: %.4f" % np.max(x))
    print("The minimum of all values in the dataset is: %.4f" % np.min(x))
    print("The median of all values in the dataset is: %.4f" % np.median(x))
    print("The mode of all values in the dataset is: %.4f" % utils.mode(x))

    # 2. Quantiles across the whole dataset
    print("The 5%% quantile across the dataset: %.4f" % np.percentile(x, 5))
    print("The 25%% quantile across the dataset: %.4f" % np.percentile(x, 25))
    print("The 50%% quantile across the dataset: %.4f" % np.percentile(x, 50))
    print("The 75%% quantile across the dataset: %.4f" % np.percentile(x, 75))
    print("The 95%% quantile across the dataset: %.4f" % np.percentile(x, 95))

    # 3. Per-region (column) means and variances
    columnMean = np.mean(x, axis=0)
    columnVar = np.var(x, axis=0)
    print("The region with the highest mean: %s" % names[np.argmax(columnMean)])
def fit(self, X, y):
    N, D = X.shape
    max_info_gain = 0

    # Get an array with the number of 0's, number of 1's, etc.
    count = np.bincount(y, minlength=2)

    # Get the index of the largest value in count.
    # Thus, y_mode is the mode (most popular value) of y
    y_mode = np.argmax(count)

    self.splitSat = y_mode
    self.splitNot = None
    self.splitVariable = None
    self.splitValue = None

    # If all the labels are the same, no need to split further
    if np.unique(y).size <= 1:
        return

    # Loop over features looking for the best split
    for d in range(D):
        for n in range(N):
            # Choose value to threshold on
            value = X[n, d]

            y_y = y[X[:, d] < value]
            y_n = y[X[:, d] >= value]
            N_y = np.size(y_y)
            N_n = np.size(y_n)

            # Skip splits that leave one side empty (avoids division by zero)
            if N_y == 0 or N_n == 0:
                continue

            # Find most likely class for each side of the split
            y_sat = utils.mode(y_y)
            y_not = utils.mode(y_n)

            # Label distribution before the split
            zero, one = count
            p = np.zeros(2)
            p[0] = zero / N
            p[1] = one / N

            # Label distributions on each side of the split
            zero_y, one_y = np.bincount(y_y, minlength=2)
            zero_n, one_n = np.bincount(y_n, minlength=2)
            p_y = np.zeros(2)
            p_y[0] = zero_y / N_y
            p_y[1] = one_y / N_y
            p_n = np.zeros(2)
            p_n[0] = zero_n / N_n
            p_n[1] = one_n / N_n

            # Compute info gain
            info_gain = entropy(p) - N_y / N * entropy(p_y) - N_n / N * entropy(p_n)

            # Keep the split with the largest information gain so far
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                self.splitVariable = d
                self.splitValue = value
                self.splitSat = y_sat
                self.splitNot = y_not

    print(self.splitValue)
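# NOTE (assumption): the decision-stump fits in this collection call an
# `entropy` helper that is not shown. A sketch consistent with how it is
# called (takes a probability vector, treats 0*log(0) as 0) might be:
import numpy as np

def entropy(p):
    """Shannon entropy of a discrete distribution given as a probability vector."""
    p = np.asarray(p, dtype=float)
    nz = p > 0  # skip zero entries so 0*log(0) contributes nothing
    return -np.sum(p[nz] * np.log(p[nz]))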
def fit(self, X, y): """ YOUR CODE HERE """ #Compute INFO GAIN N, D = X.shape # Get an array with the number of 0's, number of 1's, etc. count = np.bincount(y) # Get the index of the largest value in count. # Thus, y_mode is the mode (most popular value) of y y_mode = np.argmax(count) self.splitSat = y_mode self.splitNot = None self.splitVariable = None self.splitValue = None # If all the labels are the same, no need to split further if np.unique(y).size <= 1: return # Loop over features looking for the best split X = np.round(X) max_info_gain = 0 for d in range(D): for n in range(N): # Choose value to equate to value = X[n, d] # Find most likely class for each split y_sat = utils.mode(y[X[:,d] > value]) y_not = utils.mode(y[X[:,d] <= value]) # Make predictions y_pred = y_sat * np.ones(N) y_pred[X[:, d] <= value] = y_not #process n_yes try: n_yes_div_n = np.bincount(y[X[:,d] > value])[1] / N except IndexError: return #process n_no n_no_div_n = 1 - n_yes_div_n # Compute info gain y2 = count / y.size try: y_yes = np.bincount(y[X[:,d] > value]) / y.size y_no = 1 - y_yes except IndexError: y_no = np.bincount(y[X[:,d] <= value]) / y.size y_yes = 1 - y_no info_gain = entropy(y2) - (n_yes_div_n * entropy(y_yes)) - (n_no_div_n * entropy(y_no)) # Compare to max_info_gain so far if info_gain > max_info_gain: max_info_gain = info_gain self.splitVariable = d self.splitValue = value self.splitSat = y_sat self.splitNot = y_not
def fit(self, X, y):
    N, D = X.shape

    # Get an array with the number of 0's, number of 1's, etc.
    count = np.bincount(y)

    # Get the index of the largest value in count.
    # Thus, y_mode is the mode (most popular value) of y
    y_mode = np.argmax(count)

    self.splitSat = y_mode
    self.splitNot = None
    self.splitVariable = None
    self.splitValue = None

    # If all the labels are the same, no need to split further
    if np.unique(y).size <= 1:
        return

    # Entropy of the labels before any split
    Origin_Entropy = entropy(count / N)
    maxGain = 0

    # Loop over features looking for the best split
    # X = np.round(X)
    for d in range(D):
        for n in range(N):
            # Choose value to threshold on
            value = X[n, d]

            # Find most likely class for each side of the split
            y_sat = utils.mode(y[X[:, d] <= value])
            y_not = utils.mode(y[X[:, d] > value])

            # Make predictions
            y_pred = y_sat * np.ones(N)
            y_pred[X[:, d] > value] = y_not

            # Label counts on each side of the split
            label_a = np.bincount(y[X[:, d] <= value], minlength=2)
            label_b = np.bincount(y[X[:, d] > value], minlength=2)
            a = np.sum(label_a)
            b = np.sum(label_b)

            # An empty side contributes zero entropy (avoid division by zero)
            if a == 0:
                entro_value_a = label_a
            else:
                entro_value_a = label_a / a
            if b == 0:
                entro_value_b = label_b
            else:
                entro_value_b = label_b / b

            gain = (Origin_Entropy
                    - (a / N) * entropy(entro_value_a)
                    - (b / N) * entropy(entro_value_b))

            # Compare to the largest information gain so far
            if gain > maxGain:
                # This is the best split found so far; store it
                maxGain = gain
                self.splitVariable = d
                self.splitValue = value
                self.splitSat = y_sat
                self.splitNot = y_not
def joinBools(self, bools, sprKing):
    return utils.convertNoneToIdentity(utils.mode(bools), bools[sprKing])
percentlist = [0.05, 0.25, 0.5, 0.75, 0.95]
quantileList = [
    np.percentile(X, 5),
    np.percentile(X, 25),
    np.percentile(X, 50),
    np.percentile(X, 75),
    np.percentile(X, 95)
]
for x, y in zip(percentlist, quantileList):
    print('{} quantile: {:.4}'.format(int(x * 100), y))

print('max: {:.4}'.format(np.max(X)))
print('min: {:.4}'.format(np.min(X)))
print('mean: {:.4}'.format(np.mean(X)))
print('median: {:.4}'.format(np.median(X)))
print('mode: {:.4}'.format(utils.mode(X)))

meanList = []
varianceList = []
for index, col in df.iteritems():
    meanList.append(col.values.mean())
    varianceList.append(col.values.var())

print('highest mean region: %s' % df.columns[meanList.index(max(meanList))])
print('lowest mean region: %s' % df.columns[meanList.index(min(meanList))])
print('highest variance region: %s' % df.columns[varianceList.index(max(varianceList))])
print('lowest variance region: %s' % df.columns[varianceList.index(min(varianceList))])
def predict(example): """Find the k closest items, and have them vote for the best.""" best = heapq.nsmallest(k, ((dataset.distance(e, example), e) for e in dataset.examples)) return mode(e[dataset.target] for (d, e) in best)
def train(self, x, y, IGMIN=0.05, NMIN=0):
    self.x = np.array(x)
    self.y = np.reshape(y, (-1, 1))
    self.n = self.y.size
    self.m = self.x.shape[1]

    # MAXIMIZE VIA INFORMATION GAIN!
    # att = reduc_dim.sequential_forward_selection(
    #     self.x, self.y, self.x, self.y, utils.accuracy, 1,
    #     classif_regres.knn, 3)[0]

    if np.union1d(y, y).size == 1 or self.n < NMIN:
        self.root = utils.mode(self.y)
    else:
        ig = np.zeros(self.m)
        for att in range(0, self.m):
            # NOMINAL ATTRIBUTES ONLY
            ig[att] = self.ig(self.x, self.y, att, 0)
        att = np.argmax(ig)
        values = np.union1d(self.x[:, att], self.x[:, att])
        # if isinstance(x[0, att], basestring):
        #     thr = utils.mode(self.x[:, att])
        # else:
        #     # TODO: implement a function that finds the optimal split
        #     # threshold and place it inside this function.
        #     '''
        #     The attribute values are sorted first;
        #
        #     the midpoint between two consecutive values is a candidate
        #     cut point and is evaluated by the merit function;
        #
        #     the candidate cut point that maximizes the merit function
        #     is chosen.
        #     '''
        #     thr = np.mean(self.x[:, att])
        #     ig = self.ig(self.x, self.y, att, thr)
        ig = ig[att]
        if ig < IGMIN:
            self.root = utils.mode(y)  # most common value
        else:
            self.root = None
            self.children = {}
            for v in values:
                self.children[v] = DecisionTree(self.nclasses)
            # true if attribute is discrete (string)
            if isinstance(self.x[0, att], basestring):
                self.att = att
                x = self.x
                y = self.y
                del self.x
                del self.y
                for v in values:
                    ind = np.core.defchararray.equal(x[:, att], v)
                    self.children[v].train(x[ind, :], y[ind, 0], IGMIN, NMIN)
            else:
                # TODO: continuous variables
                pass
min_val = 100
max_val = 0
sum_val = 0
median_val = 0

# Median: sort all values and take the middle one (average of the two
# middle values when the count is even)
list_vals = X.ravel()
list_vals.sort()
if len(list_vals) % 2 == 0:
    median_val = (list_vals[(len(list_vals) / 2)] +
                  list_vals[(len(list_vals) / 2) - 1]) / 2
else:
    median_val = list_vals[(len(list_vals) - 1) / 2]

# Mode of all values in the dataset
mode_val = utils.mode(X)

# Min, max and running sum over every entry
for feature in range(D):
    for obj in range(N):
        if X[obj][feature] > max_val:
            max_val = X[obj][feature]
        if X[obj][feature] < min_val:
            min_val = X[obj][feature]
        sum_val = sum_val + X[obj][feature]

print "Minimum value is ", min_val
print "Maximum value is ", max_val
print "Mean value is ", (sum_val / (N * D))
print "Median value is ", median_val
print "Mode value is ", mode_val

# part 2: quantiles
check_grad(grads.bar, grads.bar_grad) elif question == "5.1": # Load the fluTrends dataset df = pd.read_csv(os.path.join("..", "data", "fluTrends.csv")) X = df.values names = df.columns.values # YOUR CODE HERE from scipy import stats as sc print(np.min(X)) print(np.max(X)) print(np.mean(X)) print(np.median(X)) print(utils.mode(X)) print() print(np.percentile(X, 5)) print(np.percentile(X, 25)) print(np.percentile(X, 50)) print(np.percentile(X, 75)) print(np.percentile(X, 95)) print() print(df.mean().sort_values()) print() print(df.var().sort_values()) print(df.min()) print() print(df.max()) print() print(df.mean())
io_args = parser.parse_args()
question = io_args.question

if question == "1.1":
    # Q1.1 - This should print the answers to Q 1.1
    # Load the fluTrends dataset
    X, names = utils.load_dataset("fluTrends")

    # part 1: min, max, mean, median and mode
    print "Min = %.3f" % np.amin(X)
    print "Max = %.3f" % np.amax(X)
    print "Mean = %.3f" % np.mean(X)
    print "Median = %.3f" % np.median(X)
    print "Mode = %.3f" % utils.mode(X)

    # part 2: quantiles
    print "10th quantile = %.3f" % np.percentile(X, 10)
    print "25th quantile = %.3f" % np.percentile(X, 25)
    print "50th quantile = %.3f" % np.percentile(X, 50)
    print "75th quantile = %.3f" % np.percentile(X, 75)
    print "90th quantile = %.3f" % np.percentile(X, 90)

    # part 3: maxMean, minMean, maxVar, minVar
    means = np.mean(X, axis=0)
    vars = np.var(X, axis=0)
    print "Highest Mean at %s" % names[np.argmax(means)]
    print "Lowest Mean at %s" % names[np.argmin(means)]
    print "Highest Variance at %s" % names[np.argmax(vars)]
    print "Lowest Variance at %s" % names[np.argmin(vars)]
])
io_args = parser.parse_args()
question = io_args.question

if question == "1.1":
    # Load the fluTrends dataset
    df = pd.read_csv(os.path.join('..', 'data', 'fluTrends.csv'))
    X = df.values
    names = df.columns.values

    print("   Min: %s " % np.min(X))
    print("   Max: %s " % np.max(X))
    print("  Mean: %s " % np.mean(X))
    print("Median: %s " % np.median(X))
    print("  Mode: %s " % utils.mode(X))

    print(" 5%% quantile: %s " % np.percentile(X, 5))
    print("25%% quantile: %s " % np.percentile(X, 25))
    print("50%% quantile: %s " % np.percentile(X, 50))
    print("75%% quantile: %s " % np.percentile(X, 75))
    print("95%% quantile: %s " % np.percentile(X, 95))

    print("Region with max mean: %s " % list(df.columns.values)[np.argmax(X.mean(0))])
    print("Region with min mean: %s " % list(df.columns.values)[np.argmin(X.mean(0))])
    print("Region with max variance: %s " % list(df.columns.values)[np.argmax(np.var(X, 0))])
    print("Region with min variance: %s " % list(df.columns.values)[np.argmin(np.var(X, 0))])

elif question == "2":
def fit(self, X, y): """ YOUR CODE HERE """ N, D = X.shape # Get an array with the number of 0's, number of 1's, etc. count = np.bincount(y, None, minlength=2) # Get the index of the largest value in count. # Thus, y_mode is the mode (most popular value) of y y_mode = np.argmax(count) self.splitSat = y_mode self.splitNot = None self.splitVariable = None self.splitValue = None # If all the labels are the same, no need to split further if np.unique(y).size <= 1: return #(start) This part can be a function iniProb = np.bincount(y) / np.size(y) #print(iniProb) iniEntropy = entropy(iniProb) #(end) minInfoGain = iniEntropy maxInfoGain = 0 #print(iniEntropy) minError = np.sum(y != y_mode) #print(y) # Loop over features looking for the best split for d in range(D): for n in range(N): # Choose value to equate to value = X[n, d] # Find most likely class for each split y_spl0 = y[X[:, d] >= value] y_spl1 = y[X[:, d] < value] #print(y_spl1) if np.size(y_spl0) != 0: spl0Prob = np.bincount(y_spl0, None, 2) / np.size(y_spl0) elif np.size(y_spl0) == 0: spl0Prob = np.zeros(2) spl0Entropy = entropy(spl0Prob) wSpl0Entropy = (np.size(y_spl0) / np.size(y)) * spl0Entropy if np.size(y_spl1) != 0: spl1Prob = np.bincount(y_spl1, None, 2) / np.size(y_spl1) elif np.size(y_spl1) == 0: spl1Prob = np.zeros(2) spl1Entropy = entropy(spl1Prob) wSpl1Entropy = (np.size(y_spl1) / np.size(y)) * spl1Entropy infoGain = iniEntropy - wSpl0Entropy - wSpl1Entropy #print(infoGain) #print(testarray) #print(np.size(testarray)) y_sat = utils.mode(y[X[:, d] > value]) y_not = utils.mode(y[X[:, d] < value]) #if(y_sat==0):print(y_not) #print(y_not) # Make predictions y_pred = y_sat * np.ones(N) #if(y_sat==1):print(y_pred) y_pred[X[:, d] < value] = y_not # Compute error errors = np.sum(y_pred != y) # Compare to minimum error so far if (infoGain != 0 and infoGain > maxInfoGain): # This is the lowest error, store this value minError = errors maxInfoGain = infoGain #print(minInfoGain) self.splitVariable = d self.splitValue = value self.splitSat = y_sat self.splitNot = y_not
def predict_reducer(self, minimizer_ix):
    '''Return the mode of the y's for the closest X's.'''
    return mode(self.y[minimizer_ix])[0]
    check_grad(grads.example, grads.example_grad)
    # check_grad(grads.foo, grads.foo_grad)
    # check_grad(grads.bar, grads.bar_grad)

elif question == "5.1":
    # Load the fluTrends dataset
    df = pd.read_csv(os.path.join('..', 'data', 'fluTrends.csv'))
    X = df.values
    names = df.columns.values

    minValues = X.min()
    maxValues = X.max()
    meanValues = X.mean()
    medianValues = np.median(X)
    modeValues = utils.mode(X)

    print(minValues)
    print(maxValues)
    print(meanValues)
    print(medianValues)
    print(modeValues)

    five = np.quantile(X, 0.05)
    twentyFive = np.quantile(X, 0.25)
    fifty = np.quantile(X, 0.5)
    seventyFive = np.quantile(X, 0.75)
    nintyFive = np.quantile(X, 0.95)

    print(five)
    print(twentyFive)
def predict(example):
    return mode(predictor(example) for predictor in predictors)
def fit(self, X, y):
    N, D = X.shape  # N is the num of examples, D is the num of features
    # here X is a 400 * 2 matrix
    # here y is a 400 * 1 matrix with values 0 and 1
    # print(N)  # here N = 400
    # print(D)  # here D = 2

    # Get an array with the number of 0's, number of 1's, etc.
    count = np.bincount(y)
    # here count = [234 166], num of 0 = 234, num of 1 = 166
    # print(count)

    # Get the index of the largest label value in count,
    # i.e. y_mode = the label value (0 or 1) that appears more often.
    # Thus, y_mode is the mode (most popular value) of y
    y_mode = np.argmax(count)  # most popular label value, here 0
    # print(y_mode)

    self.splitSat = y_mode        # predicted label when the equality rule is satisfied
    self.splitNot = None          # predicted label when it is not
    self.splitVariable = None     # split feature for the equality rule
    self.splitValue = None        # value used in the equality rule

    # If all the labels are the same, no need to split further
    if np.unique(y).size <= 1:
        return

    minError = np.sum(y != y_mode)  # here minError = 166
    # print(y)
    # print(y != y_mode)
    # print(minError)

    # Loop over features looking for the best split
    X = np.round(X)  # round each Xnd of X to an integer

    # value = X[1, 1]
    # print(value)  # 32
    # print(X[:, 1])
    # y_sat = utils.mode(y[X[:, 1] == value])
    # print(y_sat)  # 1
    # print(y[X[:, 1] == value])  # [1 1 1 1 1 1 0 1 1 1 1 1]
    # print(y[X[:, 1] != value])
    # print(utils.mode(y[X[:, 1] != value]))

    # Find most likely class for each split.
    # First, look at X[:, d] == value. This is a condition: it is true for the
    # rows whose d-th column of X equals "value". Then y[condition] gives the
    # labels of the rows which are true (the points that satisfy the equality
    # rule in the decision stump).
    for d in range(D):      # outer loop: each feature
        for n in range(N):  # inner loop: each example
            # Choose value to equate to
            value = X[n, d]  # Xnd, our current equality value

            # "y[X[:, d] == value]" gives the values in y where the
            # corresponding Xnd equals the current equality value.
            # y_sat is the most frequent label value in y[X[:, d] == value];
            # y_not is the most frequent label value in y[X[:, d] != value].
            y_sat = utils.mode(y[X[:, d] == value])
            y_not = utils.mode(y[X[:, d] != value])

            # Make predictions
            y_pred = y_sat * np.ones(N)  # = [1 1 1 ...] or [0 0 0 ...]
            # change the predictions where X[:, d] != value to y_not
            y_pred[X[:, d] != value] = y_not

            # Compute error: (y_pred != y) is 1 wherever y_pred and y disagree
            errors = np.sum(y_pred != y)

            # Compare to minimum error so far
            if errors < minError:
                # This is the lowest error, store this value
                minError = errors
                self.splitVariable = d   # the new split feature
                self.splitValue = value  # the new equality value
                self.splitSat = y_sat    # predicted y if Xnd == splitValue
                self.splitNot = y_not    # predicted y if Xnd != splitValue
def fit(self, X, y):
    N, D = X.shape

    # Get an array with the number of 0's, number of 1's, etc.
    count = np.bincount(y)

    # Get the index of the largest value in count.
    # Thus, y_mode is the mode (most popular value) of y
    y_mode = np.argmax(count)

    self.splitSat = y_mode
    self.splitNot = None
    self.splitVariable = None
    self.splitValue = None

    # Set information gain to 0 for baseline
    maxInfo = 0

    # If all the labels are the same, no need to split further
    if np.unique(y).size <= 1:
        return

    # Loop over features looking for the best split by infoGain
    for d in range(D):
        for n in range(N):
            # Choose threshold for constraint X[:, d] > value
            value = X[n, d]

            # Compute the labels satisfying and not satisfying the constraint
            y_yes = y[X[:, d] > value]
            y_no = y[X[:, d] <= value]

            # Skip thresholds that no example satisfies
            if y_yes.size == 0:
                continue

            # Find most likely class for each split
            y_sat = utils.mode(y_yes)
            y_not = utils.mode(y_no)

            # Compute information gain
            n_yes = y_yes.size
            n_no = y_no.size

            classes = np.bincount(y).size
            dist = np.bincount(y, minlength=classes)
            dist_yes = np.bincount(y_yes, minlength=classes)
            dist_no = np.bincount(y_no, minlength=classes)

            # Note that entropy is 0 if there is no data
            a = b = c = 0
            if np.sum(dist) != 0:
                a = entropy(dist / np.sum(dist))
            if np.sum(dist_yes) != 0:
                b = entropy(dist_yes / np.sum(dist_yes))
            if np.sum(dist_no) != 0:
                c = entropy(dist_no / np.sum(dist_no))

            infoGain = a - n_yes * b / N - n_no * c / N

            # Compare to the largest information gain so far
            if infoGain > maxInfo:
                # This is the best split found so far; store it
                maxInfo = infoGain
                self.splitVariable = d
                self.splitValue = value
                self.splitSat = y_sat
                self.splitNot = y_not
"1.1", "2", "2.2", "2.3", "2.4", "3", "3.1", "3.2", "4.1", "4.2", "5" ]) io_args = parser.parse_args() question = io_args.question if question == "1.1": # retrieve max, min, median, mode ds = pd.read_csv('../data/fluTrends.csv') maximum = ds.values.max() minimum = ds.values.min() median = ds.stack().median() mode = utils.mode(ds.values) results = [maximum, minimum, median, mode] # retrieve quantiles print("quantiles: %s" % ds.stack().quantile([0.05, 0.25, 0.5, 0.75, 0.95])) # retrieve maximum mean, minimum mean, highest variance, lowest variance means = ds.mean() variances = ds.var() maxMean = means.idxmax(axis=0) minMean = means.idxmin(axis=0)
def predict(example):
    print([predictor(example) for predictor in predictors])
    return mode(predictor(example) for predictor in predictors)
elif question == "5.1":
    # Load the fluTrends dataset
    df = pd.read_csv(os.path.join('..', 'data', 'fluTrends.csv'))
    X = df.values
    names = df.columns.values

    # -----------------
    # 5.1.1
    # -----------------
    minimum = np.round(np.amin(X), decimals=4)
    maximum = np.round(np.amax(X), decimals=4)
    mean = np.round(np.mean(X), decimals=4)
    median = np.round(np.median(X), decimals=4)
    mode = np.round(utils.mode(X), decimals=4)

    print("\n-----------------------")
    print("Question 5.1.1\n")
    print("min = " + str(minimum))
    print("max = " + str(maximum))
    print("mean = " + str(mean))
    print("median = " + str(median))
    print("mode = " + str(mode))

    # -----------------
    # 5.1.2
    # -----------------
    # Requires numpy version >= 1.15 (np.quantile was added in 1.15)
    quantiles = np.round(np.quantile(X, [0.05, 0.25, 0.5, 0.75, 0.95]),
                         decimals=6)
check_grad(grads.bar, grads.bar_grad) elif question == "5.1": # Load the fluTrends dataset df = pd.read_csv(os.path.join('..', 'data', 'fluTrends.csv')) X = df.values names = df.columns.values # YOUR CODE HERE # 1. The minimum, maximum, mean, median, # and mode of all values across the dataset. print("The minimum is: %s" % np.ndarray.min(X)) print("The maximum is: %s" % np.ndarray.max(X)) print("The mean is: %s" % np.ndarray.mean(X)) print("The median is: %s" % np.median(X)) print("The mode is %s" % utils.mode(X)) # 2. The 5%, 25%, 50%, 75%, and 95% quantiles # of all values across the dataset. print("The 5%% quantile is %s" % np.percentile(X, 5)) print("The 25%% quantile is %s" % np.percentile(X, 25)) print("The 50%% quantile is %s" % np.percentile(X, 50)) print("The 75%% quantile is %s" % np.percentile(X, 75)) print("The 95%% quantile is %s" % np.percentile(X, 95)) # 3. The names of the regions with the highest and lowest means, # and the highest and lowest variances. regionMean = np.mean(X, axis=0) regionVar = np.var(X, axis=0) print("The region {} has the highest mean of {} ".format( names[np.argmax(regionMean)], np.max(regionMean)))
    choices=[
        "1.1", "2", "2.2", "2.3", "2.4", "3", "3.1", "3.2", "4.1", "4.2", "5"
    ])
io_args = parser.parse_args()
question = io_args.question

if question == "1.1":
    dataset = utils.load_dataset("fluTrends")
    print("Minimum of dataset", ":", np.min(dataset[0]))
    print("Maximum of dataset", ":", np.max(dataset[0]))
    print("Mean of dataset", "   :", np.mean(dataset[0]))
    print("Median of dataset", " :", np.median(dataset[0]))
    print("Mode of dataset", "   :", utils.mode(dataset[0]))

elif question == "2":
    # 1. Load citiesSmall dataset
    dataset = utils.load_dataset("citiesSmall")
    X = dataset["X"]
    y = dataset["y"]

    # 2. Evaluate majority predictor model
    y_pred = np.zeros(y.size) + utils.mode(y)
    error = np.mean(y_pred != y)
    print("Mode predictor error: %.3f" % error)

    # 3. Evaluate decision stump
def findOddScoutForListOfDicts(self, tempTIMDs, key1):
    # Similar to findOddScoutForDict, but for lists of dicts instead of individual dicts.
    # The nth dict on each list should be the same.
    weight = self.gradingListsOfDicts[key1][0]
    allScouts = filter(lambda v: v, map(lambda k: k.get('scoutName'), tempTIMDs))
    # Unsorted meaning they can have different lengths
    unsortedLists = filter(
        lambda k: k,
        map(lambda t: t.get(key1) if t.get('scoutName') else None, tempTIMDs))
    # Finds the mode of the dict-list lengths and ignores lists that are not that length,
    # i.e. if there is disagreement over how many shots a robot took
    if unsortedLists != None:
        modeListLength = utils.mode([len(lis) for lis in unsortedLists])  # finds mode, not max
        modeAmount = [len(lis) for lis in unsortedLists].count(modeListLength)
        # If someone missed an attempt or had an extra attempt, there is no way to compare their data.
        # This filters out anything with a different length of dicts.
        # 2018 - each dict is an attempt
        lists = []
        aScouts = []
        for aScoutIndex in range(len(unsortedLists)):
            if len(unsortedLists[aScoutIndex]) == modeListLength:
                lists.append(unsortedLists[aScoutIndex])
                aScouts.append(allScouts[aScoutIndex])
            elif modeAmount > 1:
                # Updates SPR if incorrect list amount and at least 2 scouts agree
                self.sprs.update({allScouts[aScoutIndex]: (self.sprs.get(allScouts[aScoutIndex]) or 0) + weight})
                self.disagreementBreakdown[allScouts[aScoutIndex]].update({
                    key1: {'amount': (self.disagreementBreakdown[allScouts[aScoutIndex]].get(key1, {}).get('amount', 0) + 1)}})
        # Need at least 2 scouts to compare, or SPR is not affected
        if modeAmount > 1:
            for num in range(modeListLength):
                # Comparing dicts that should be the same (e.g. each shot time dict for the same
                # shot) within the tempTIMDs. This means the nth shot by a given robot in a given
                # match, as recorded by multiple scouts. The comparison itself is the same as the
                # other findOddScout functions.
                dicts = [lis[num] for lis in lists]
                scouts = [scout for scout in aScouts]
                values = []
                for aDict in dicts:
                    values += [aDict['didSucceed']]
                modeSuccess = utils.mode(values)
                if modeSuccess != None:
                    popList = []
                    weight = self.gradingListsOfDicts[key1][1]['didSucceed']
                    for aDictIndex in range(len(dicts)):
                        if dicts[aDictIndex]['didSucceed'] != modeSuccess:
                            popList.append(aDictIndex)
                    for item in popList[::-1]:
                        # self.SPRBreakdown.update({key2: (self.SPRBreakdown.get(key2) or []) + [(differenceFromMode[c])]})
                        self.sprs.update({scouts[item]: (self.sprs.get(scouts[item]) or 0) + weight})
                        self.disagreementBreakdown[scouts[item]].update({
                            key1: {'didSucceed': (self.disagreementBreakdown[scouts[item]].get(key1, {}).get('didSucceed', 0) + 1)}})
                        dicts.pop(item)
                        scouts.pop(item)
                for key2 in dicts[0].keys():
                    # Strings can be averaged (we're just looking at the mode, not subtracting them).
                    # Without averaging, one person could be declared correct for no reason.
                    values = [aDict[key2] for aDict in dicts]
                    weight = self.gradingListsOfDicts[key1][1][key2]
                    mode = utils.mode(values)
                    if mode != None:
                        differenceFromMode = [weight if v != mode else 0 for v in values]
                        # Gets inaccuracy by category
                        for c in range(len(differenceFromMode)):
                            self.SPRBreakdown.update({key2: (self.SPRBreakdown.get(key2) or []) + [(differenceFromMode[c])]})
                            if weight != 0.0:
                                self.sprs.update({scouts[c]: (self.sprs.get(scouts[c]) or 0) + differenceFromMode[c]})
                                self.disagreementBreakdown[scouts[c]].update({
                                    key1: {key2: (self.disagreementBreakdown[scouts[c]].get(key1, {}).get(key2, 0) + 1)}})
    print(df.max())
    print(df.min())
    print(df.mean())
    print(df.median())
    print(df.mode())

elif question == "6":
    # 1. Load citiesSmall dataset
    with open(os.path.join('..', 'data', 'citiesSmall.pkl'), 'rb') as f:
        dataset = pickle.load(f)
    X = dataset["X"]
    y = dataset["y"]

    # 2. Evaluate majority predictor model
    y_pred = np.zeros(y.size) + utils.mode(y)
    error = np.mean(y_pred != y)
    print("Mode predictor error: %.3f" % error)

    # 3. Evaluate decision stump
    model = DecisionStumpEquality()
    model.fit(X, y)
    y_pred = model.predict(X)
    error = np.mean(y_pred != y)
    print("Decision Stump with equality rule error: %.3f" % error)

    # Plot result
    utils.plotClassifier(model, X, y)
def findOddScoutForListOfDictsDicts(self, tempTIMDs, key1):
    # Similar to findOddScoutForListOfDicts, but for a (dict in dict) in a list.
    # The nth dict on each list should be the same.
    weight = self.gradingListsOfDictsDicts[key1][0]
    allScouts = filter(lambda v: v, map(lambda k: k.get('scoutName'), tempTIMDs))
    # Unsorted meaning they can have different lengths
    unsortedLists = [
        tempTIMDs[tempTIMD].get(key1, [])
        for tempTIMD in range(len(tempTIMDs))
        if tempTIMDs[tempTIMD].get('scoutName')
    ]
    # Finds the mode of the dict-list lengths and ignores lists that are not that length,
    # i.e. if there is disagreement over how many shots a robot took
    if unsortedLists:
        lenList = [len(lis) for lis in unsortedLists]
        modeListLength = utils.mode(lenList)
        modeAmount = lenList.count(modeListLength)
        # If someone missed an attempt or had an extra attempt, there is no way to compare their data.
        # This filters out anything with a different length of dicts.
        # 2018 - each dict is an attempt
        # This is year specific code for 2018!
        lists = []
        scouts = []
        for aScoutIndex in range(len(unsortedLists)):
            if len(unsortedLists[aScoutIndex]) == modeListLength:
                lists.append(unsortedLists[aScoutIndex])
                scouts.append(allScouts[aScoutIndex])
            elif modeAmount > 1:
                # Updates SPR if incorrect list amount and at least 2 scouts agree
                self.sprs.update({allScouts[aScoutIndex]: (self.sprs.get(allScouts[aScoutIndex]) or 0) + weight})
                self.disagreementBreakdown[allScouts[aScoutIndex]].update({
                    key1: {'amount': (self.disagreementBreakdown[allScouts[aScoutIndex]].get(key1, {}).get('amount', 0) + 1)}})
        # Need at least 2 scouts to compare, or SPR is not affected
        if modeAmount > 1:
            for num in range(modeListLength):
                # Comparing dicts that should be the same (e.g. each shot time dict for the same
                # shot) within the tempTIMDs. This means the nth shot by a given robot in a given
                # match, as recorded by multiple scouts. The comparison itself is the same as the
                # other findOddScout functions.
                dicts = [lis[num] for lis in lists]
                keys = []
                for x in dicts:
                    keys.append(x.keys()[0])
                modeKey = max(set(keys), key=keys.count)
                modeKeyAmount = keys.count(modeKey)
                dicts2 = []
                scouts2 = []
                weight = self.gradingListsOfDictsDicts[key1][1]
                for index in range(len(dicts)):
                    if dicts[index].keys()[0] == modeKey:
                        dicts2.append(dicts[index])
                        scouts2.append(scouts[index])
                    else:
                        self.sprs.update({scouts[index]: (self.sprs.get(scouts[index]) or 0) + weight})
                        self.disagreementBreakdown[scouts[index]].update({
                            key1: {'climbType': (self.disagreementBreakdown[scouts[index]].get(key1, {}).get('climbType', 0) + 1)}})
                # Must have 2 scouts to compare, or SPR is not affected
                if modeKeyAmount > 1:
                    for key2 in dicts2[0].keys():
                        for key3 in dicts2[0][key2].keys():
                            # Strings can be averaged (we're just looking at the mode, not
                            # subtracting them). Without averaging, one person could be
                            # declared correct for no reason.
                            values = []
                            for aDict in dicts2:
                                values += [aDict[key2][key3]]
                            weight = self.gradingListsOfDictsDicts[key1][2][key2][key3]
                            if weight != 0.0:
                                mode = utils.mode(values)
                                if mode:
                                    differenceFromMode = map(lambda v: weight if v != mode else 0, values)
                                    # Gets inaccuracy by category
                                    for c in range(len(differenceFromMode)):
                                        self.SPRBreakdown.update({key2: (self.SPRBreakdown.get(key2) or []) + [(differenceFromMode[c])]})
                                        self.sprs.update({scouts2[c]: (self.sprs.get(scouts2[c]) or 0) + differenceFromMode[c]})
                                        self.disagreementBreakdown[scouts2[c]].update({
                                            key1: {key2: {key3: (self.disagreementBreakdown[scouts2[c]].get(key1, {}).get(key2, {}).get(key3, 0) + 1)}}})
parser = argparse.ArgumentParser()
parser.add_argument('-q', '--question', required=True,
                    choices=["1.1", "2", "2.2", "2.3", "2.4", "3", "3.1",
                             "3.2", "4.1", "4.2", "5"])
io_args = parser.parse_args()
question = io_args.question

if question == "1.1":
    # Q1.1 - This should print the answers to Q 1.1
    # Load the fluTrends dataset
    X, names = utils.load_dataset("fluTrends")

    # part 1: min, max, mean, median and mode
    results = ("Min: %.3f, Max: %.3f, Mean: %.3f, Median: %.3f, Mode: %.3f"
               % (np.min(X), np.max(X), np.mean(X), np.median(X), utils.mode(X)))
    print(results)

    # part 2: quantiles
    print("quantiles: %s" % np.percentile(X, [10, 25, 50, 75, 90]))

    # part 3: maxMean, minMean, maxVar, minVar
    means = np.mean(X, axis=0)
    variances = np.var(X, axis=0)
    results = ("maxMean: %s, minMean: %s, maxVar: %s, minVar: %s"
               % (names[np.argmax(means)], names[np.argmin(means)],
                  names[np.argmax(variances)], names[np.argmin(variances)]))

    # part 4: correlation between columns
    corr = np.corrcoef(X.T)