import numpy as np
from scipy import stats
from sklearn.cross_validation import StratifiedKFold  # old scikit-learn API (pre-0.18)
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors


def estimator_knn_cv(X, y, clf, n_neigh):
    neigh = NearestNeighbors(n_neigh, metric="euclidean", algorithm="brute")
    neigh_est = NearestNeighbors(n_neigh, metric="manhattan", algorithm="brute")
    acc = []
    for train, test in StratifiedKFold(y, 5):
        X_train = X[train]
        y_train = y[train]
        X_test = X[test]
        y_test = y[test]
        clf.fit(X_train, y_train)
        estimators = clf.estimators_
        preds_train = np.array(map(lambda e: e.predict(X_train), estimators)).T
        preds_test = np.array(map(lambda e: e.predict(X_test), estimators)).T
        preds_train_proba = np.array(map(lambda e: e.predict_proba(X_train), estimators))
        preds_test_proba = np.array(map(lambda e: e.predict_proba(X_test), estimators))
        p_train = preds_train_proba.swapaxes(0, 1)[:, :, 0]
        p_test = preds_test_proba.swapaxes(0, 1)[:, :, 0]
        neigh.fit(X_train)
        dist, knn = neigh.kneighbors(X_test)
        neigh_est.fit(preds_train)
        dist, knn_est = neigh_est.kneighbors(preds_test)
        # neigh_est.fit(p_train); dist, knn_est = neigh_est.kneighbors(p_test)
        knn_combined_uniq = np.array(map(np.unique, np.hstack((knn[:, :30], knn_est[:, :30]))))
        pp_uniq = np.array([stats.mode(y_train[nn])[0][0] for nn in knn_combined_uniq])
        # pp_uniq = np.array([stats.mode(y_train[nn])[0][0] for nn in knn[:, :30]])
        preds_test_est_knn = np.array(
            [[stats.mode(y_train[nn])[0][0] for nn in knn_est[:, :i]] for i in xrange(1, n_neigh, 2)]
        )
        acc.append(
            [accuracy_score(y_test, pred) for pred in np.vstack((preds_test_est_knn, clf.predict(X_test), pp_uniq))]
        )
    mean_acc = np.mean(acc, axis=0)
    print " ".join("{:.3f}".format(v) for v in mean_acc), " max:{:.3f}".format(mean_acc.max())
def predictedGroup(p, tr, nn=3, e='s'):
    # Compares the points (p) to your tree (tr), providing its predicted
    # category for each point based off of its (nn) nearest neighbors.
    # Explanation can be short e[s] or long e[l].
    k = ann.kdtree(tr[:, :tr.shape[1] - 1])
    l = k.knn(p[:, :p.shape[1] - 1], nn)
    # dist = distGroups(tr)
    # pr = []
    print "l[0]"
    print l[0], "\n"
    # print dist
    # print "tr[0][-1] \n", tr[0][-1]
    if e == 's':
        for i in l[0]:
            pass
    else:
        print tr, "\n"
        print p, "\n"
    ll = np.zeros((l[0].shape[0], p.shape[1]))
    kk = 0
    for i in l[0]:
        ii = 0
        for j in i:
            ll[kk][ii] = tr[j][-1]
            ii += 1
        kk += 1
    print "Groups \n", ll
    print "Modes \n", stat.mode(ll, 1)
    pred = assignGroup(p, stat.mode(ll, 1)[0])
    print pred
    return pred
def test_03_02_circle(self):
    '''Test the module on a uniform circle'''
    i, j = np.mgrid[-50:51, -50:51]
    labels = (np.sqrt(i * i + j * j) <= 40).astype(int)
    m, workspace = self.run_module(
        np.ones(labels.shape), labels, wants_workspace=True)
    assert isinstance(workspace, cpw.Workspace)
    bins = labels * (1 + (np.sqrt(i * i + j * j) / 10).astype(int))
    for bin in range(1, 5):
        data = m.get_current_measurement(OBJECT_NAME,
                                         feature_frac_at_d(bin, 4))
        self.assertEqual(len(data), 1)
        area = (float(bin) * 2.0 - 1.0) / 16.0
        self.assertTrue(data[0] > area - .1)
        self.assertTrue(data[0] < area + .1)
        heatmap = workspace.image_set.get_image(
            HEAT_MAP_NAME + M.F_FRAC_AT_D).pixel_data
        data = data.astype(heatmap.dtype)
        self.assertEqual(mode(heatmap[bins == bin])[0][0], data[0])
        data = m.get_current_measurement(OBJECT_NAME,
                                         feature_mean_frac(bin, 4))
        self.assertEqual(len(data), 1)
        self.assertAlmostEqual(data[0], 1, 2)
        heatmap = workspace.image_set.get_image(
            HEAT_MAP_NAME + M.F_MEAN_FRAC).pixel_data
        data = data.astype(heatmap.dtype)
        self.assertEqual(mode(heatmap[bins == bin])[0][0], data[0])
        data = m.get_current_measurement(OBJECT_NAME,
                                         feature_radial_cv(bin, 4))
        self.assertEqual(len(data), 1)
        self.assertAlmostEqual(data[0], 0, 2)
        heatmap = workspace.image_set.get_image(
            HEAT_MAP_NAME + M.F_RADIAL_CV).pixel_data
        data = data.astype(heatmap.dtype)
        self.assertEqual(mode(heatmap[bins == bin])[0][0], data[0])
def statistics():
    global data
    data = [i.split(',') for i in data.splitlines()]
    column_names = data[0]
    data_rows = data[1:]
    df = pd.DataFrame(data_rows, columns=column_names)
    df["Alcohol"] = df["Alcohol"].astype(float)
    df["Tobacco"] = df["Tobacco"].astype(float)
    print "Alcohol dataset stats:"
    print "Mean =", df['Alcohol'].mean()
    print "Median =", df['Alcohol'].median()
    print "Mode =", stats.mode(df["Alcohol"])[0][0]
    print "Range =", max(df['Alcohol']) - min(df['Alcohol'])
    print "Variance =", df['Alcohol'].var()
    print "Standard Deviation =", df['Alcohol'].std()
    print "\n"
    print "Tobacco dataset stats:"
    print "Mean =", df['Tobacco'].mean()
    print "Median =", df['Tobacco'].median()
    print "Mode =", stats.mode(df["Tobacco"])[0][0]
    print "Range =", max(df['Tobacco']) - min(df['Tobacco'])
    print "Variance =", df['Tobacco'].var()
    print "Standard Deviation =", df['Tobacco'].std()
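# Hedged illustration (not from the original source): why the [0][0]
# indexing above is needed. With the legacy scipy API (pre-1.11, which these
# snippets assume), stats.mode returns length-1 arrays for mode and count.
from scipy import stats

result = stats.mode([1, 2, 2, 3])
print(result)        # ModeResult(mode=array([2]), count=array([2]))
print(result[0][0])  # 2 -- the modal value itself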
def give_me_seperation_and_repitition(min_list):
    seperation_list = []
    sep_and_repeat_list = []
    for i in range(1, len(min_list)):
        diff = min_list[i] - min_list[i - 1]
        seperation_list.append(diff)
    # print seperation_list
    repeated_seperation = mode(seperation_list)
    while repeated_seperation[1][0] > 1:
        sep_and_repeat_list.append([repeated_seperation[0][0], repeated_seperation[1][0]])
        while repeated_seperation[0][0] in seperation_list:
            seperation_list.remove(repeated_seperation[0][0])
        repeated_seperation = mode(seperation_list)
    # print sep_and_repeat_list
    if sep_and_repeat_list != []:
        for i in range(len(sep_and_repeat_list)):
            # drop separations that repeat in less than 10 percent of the list
            if sep_and_repeat_list[i][1] / float(len(seperation_list)) < 0.1:
                sep_and_repeat_list[i] = [0, 0]
        while [0, 0] in sep_and_repeat_list:
            sep_and_repeat_list.remove([0, 0])
    return sep_and_repeat_list
def lesson():
    global data
    data = [i.split(',') for i in data.splitlines()]
    column_names = data[0]
    data_rows = data[1:]
    df = pd.DataFrame(data_rows, columns=column_names)
    df["Alcohol"] = df["Alcohol"].astype(float)
    df["Tobacco"] = df["Tobacco"].astype(float)
    print df['Alcohol'].mean()
    print df['Alcohol'].median()
    print stats.mode(df["Alcohol"])[0][0]
    print '\n'
    print df['Tobacco'].mean()
    print df['Tobacco'].median()
    print stats.mode(df["Tobacco"])[0][0]
    print '\n'
    print max(df['Alcohol']) - min(df['Alcohol'])
    print df['Alcohol'].std()
    print df['Alcohol'].var()
    print '\n'
    print max(df['Tobacco']) - min(df['Tobacco'])
    print df['Tobacco'].std()
    print df['Tobacco'].var()
    print "\n"
    # z-score of Utopian people
    mean = 251
    std = 20
    x = 2.3 * std + mean
    print "the days corresponding to a z-score of 2.3 is", x
def add_instance(self, instance, target, representation=None):
    """Adds the given instance, target and representation to the corpus.

    Args:
        instance: a vector with shape equal to self.instances.shape[1]
        target: a list of strings representing the classes.
        representation: a string
    """
    if isinstance(self.instances, scipy.sparse.csr.csr_matrix):
        instance = csr_matrix(instance)
        self.instances = vstack((self.instances, instance), format='csr')
    else:
        self.instances = csr_matrix(instance)
    self.full_targets.append(target)
    self.representations.append(representation)
    if target:
        if mode(target)[1][0] != 1:
            self.primary_targets.append(mode(target)[0][0])
        else:
            self.primary_targets.append(target[0])
    else:
        self.primary_targets.append(None)
    for key in self.extra_info:
        self.extra_info[key].append(0)
def back_to_numbers(class_scores, scores_to_lables_lists, numclass):
    from scipy.stats import mode
    back_to_values_mean = np.zeros(len(class_scores))
    back_to_values_mode_small = np.zeros(len(class_scores))
    back_to_values_mode_larger = np.zeros(len(class_scores))
    back_to_values_median = np.zeros(len(class_scores))
    back_to_values_max = np.zeros(len(class_scores))
    back_to_values_min = np.zeros(len(class_scores))
    lables = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']
    numbers_lables_dict = dict()
    for j in range(0, 11):
        numbers_lables_dict[lables[j]] = j
    for i in range(len(class_scores)):
        cs = class_scores[i]
        bin = numbers_lables_dict[cs]
        back_to_values_mean[i] = np.array(scores_to_lables_lists[bin]).mean()
        back_to_values_mode_small[i] = mode(scores_to_lables_lists[bin])[0][0]
        back_to_values_mode_larger[i] = mode(scores_to_lables_lists[bin])[1][0]
        back_to_values_median[i] = np.median(scores_to_lables_lists[bin])
        back_to_values_max[i] = np.array(scores_to_lables_lists[bin]).max()
        back_to_values_min[i] = np.array(scores_to_lables_lists[bin]).min()
    return [back_to_values_mean, back_to_values_mode_small, back_to_values_mode_larger,
            back_to_values_median, back_to_values_max, back_to_values_min]
def predict(self, X):
    """Predict the class labels for the provided data

    Parameters
    ----------
    X: array
        A 2-D array representing the test points.

    Returns
    -------
    labels: array
        List of class labels (one for each data sample).
    """
    X = atleast2d_or_csr(X)
    if self.classification_type == 'knn_vote':
        neigh_ind = self.kneighbors(X, return_distance=False)
        pred_labels = self._y[neigh_ind]
        mode, _ = stats.mode(pred_labels, axis=1)
        return mode.flatten().astype(np.int)
    else:
        neigh_ind = self.radius_neighbors(X, return_distance=False)
        pred_labels = [self._y[ind] for ind in neigh_ind]
        return np.asarray([stats.mode(pi)[0][0] for pi in pred_labels], dtype=np.int)
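# Hedged usage sketch (not from the original source): the same stats.mode
# majority vote on a toy neighbor-label matrix, one row per test point.
# Assumes the legacy scipy API with array-valued results.
import numpy as np
from scipy import stats

pred_labels = np.array([[0, 1, 1],    # neighbor labels of test point 0
                        [2, 2, 0]])   # neighbor labels of test point 1
vote, _ = stats.mode(pred_labels, axis=1)
print(vote.flatten())  # [1 2]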
def make_fused_set(features, labels, files, data_set, target_file):
    # Start fusing
    subject_idx = []
    subject_vol_dict = collections.defaultdict(list)
    subject_class_dict = collections.defaultdict(list)
    j = 0
    # find volume indices for each unique subject
    for i in files:
        paresd_str = str(i).split('_')
        subject_id = int(paresd_str[4])
        subject_idx.append(subject_id)
        subject_vol_dict[subject_id].append(j)
        subject_class_dict[subject_id].append(labels[j])
        j = j + 1
    print len(subject_vol_dict), len(subject_class_dict)
    fuse_vol_array = np.zeros((len(set(subject_idx)), features.shape[1]))
    fuse_class_array = np.zeros(len(set(subject_idx)))
    j = 0
    for i in set(subject_idx):
        fuse_vol = stats.mode(features[subject_vol_dict[i]])
        fuse_vol_array[j, :] = fuse_vol[0]
        fuse_class_array[j] = stats.mode(subject_class_dict[i])[0]
        j = j + 1
    # Save fused files
    output_data = h5.File(target_file, 'a')
    output_data.create_dataset('{}_data_fused'.format(data_set), data=fuse_vol_array)
    output_data.create_dataset('{}_class_fused'.format(data_set), data=fuse_class_array)
    output_data.close()
def new_data(data):
    z = []  # used to store the index of zeroed elements
    m, n = numpy.shape(data)  # get the dimensions of the matrix
    y = int(data[0, n - 1])  # extract the classification from the last column
    sim_length = numpy.zeros(m)  # create an array to store the lengths
    for i in range(m):
        z.append(numpy.nonzero(data[i, :] == 0)[0])
        if len(z[i]) > 0:
            sim_length[i] = z[i][0]
        else:
            sim_length[i] = n
    longest = max(sim_length)
    shortest = min(sim_length)
    for i in range(m):
        length = int(sim_length[i])
        if length < longest:
            stuffing = int(longest - length)
            start = int(numpy.floor(length / 2))
            x = numpy.zeros((stuffing - 1, n), dtype=int)
            in_proc_stretch = numpy.vstack((data[1, :], x))
            offset = start
            for j in range(1, stuffing):
                in_proc_stretch[j, 0:offset] = in_proc_stretch[j - 1, 0:offset]
                md = stats.mode(data[0, 0:int(sim_length[0]) - 1])
                in_proc_stretch[j, offset] = md[0] + numpy.random.random_integers(-3, 3)
                in_proc_stretch[j, offset + 1:length + 1] = in_proc_stretch[j - 1, offset:length]
                length = length + 1
                offset = offset + 1
        else:
            shrinkage = int(length - shortest)
            x = numpy.zeros((shrinkage - 2, n), dtype=int)
            in_proc_shrink = numpy.vstack((data, x))
            in_proc_shrink[1, :] = 0
            for j in range(0, shrinkage - 1):
                md = stats.mode(in_proc_shrink[j, 0:length])
                ind = numpy.nonzero(in_proc_shrink[j, 0:length] == md[0])
                in_proc_shrink[j + 1, 0:ind[0][-1]] = in_proc_shrink[j, 0:ind[0][-1]]
                in_proc_shrink[j + 1, ind[0][-1]:-2] = in_proc_shrink[j, (ind[0][-1] + 1):-1]
                length = length - 1
    in_proc_stretch[:, n - 1] = y
    in_proc_stretch[:, 0:start] = data[0, 0:start]
    print in_proc_stretch
    in_proc_shrink[:, n - 1] = y
    in_proc_shrink[:, 0:start] = data[1, 0:start]
    print in_proc_shrink
    sim_data = numpy.vstack((in_proc_stretch, in_proc_shrink))
    return sim_data
def Assign_Parameters_Semidistributed(covariates, metadata, hydrobloks_info, OUTPUT, cluster_ids, mask):
    nclusters = hydrobloks_info['nclusters']
    # Initialize the arrays
    vars = ['area', 'area_pct', 'BB', 'DRYSMC', 'F11', 'MAXSMC', 'REFSMC', 'SATPSI',
            'SATDK', 'SATDW', 'WLTSMC', 'QTZ', 'slope', 'ti', 'dem', 'carea', 'channel',
            'land_cover', 'soil_texture_class',
            'mannings', 'm', 'psoil', 'pksat', 'sdmax']
    OUTPUT['hsu'] = {}
    for var in vars:
        OUTPUT['hsu'][var] = np.zeros(nclusters)
    # Metadata
    # NLCD2NOAH = {11:17,12:15,21:10,22:10,23:10,24:13,31:16,41:4,42:1,43:5,51:6,52:6,71:10,72:10,73:19,74:19,81:10,82:12,90:11,95:11}
    for hsu in np.arange(nclusters):
        # Set indices
        idx = np.where(cluster_ids == hsu)
        # Calculate area per hsu
        OUTPUT['hsu']['area'][hsu] = metadata['resx']**2 * idx[0].size
        # Calculate area percentage per hsu
        OUTPUT['hsu']['area_pct'][hsu] = 100 * OUTPUT['hsu']['area'][hsu] / (metadata['resx']**2 * mask[mask].size)
        # Soil properties
        for var in ['BB', 'DRYSMC', 'F11', 'MAXSMC', 'REFSMC', 'SATPSI', 'SATDK', 'SATDW', 'WLTSMC', 'QTZ']:
            if var in ['SATDK', 'SATDW']:
                OUTPUT['hsu'][var][hsu] = stats.mstats.hmean(covariates[var][idx])
            else:
                OUTPUT['hsu'][var][hsu] = stats.mstats.gmean(covariates[var][idx])
        # Average slope
        OUTPUT['hsu']['slope'][hsu] = np.mean(covariates['cslope'][idx])
        # Topographic index
        OUTPUT['hsu']['ti'][hsu] = np.mean(covariates['ti'][idx])
        # DEM
        OUTPUT['hsu']['dem'][hsu] = np.mean(covariates['dem'][idx])
        # Average catchment area
        OUTPUT['hsu']['carea'][hsu] = np.mean(covariates['carea'][idx])
        # Channel?
        OUTPUT['hsu']['channel'][hsu] = stats.mode(covariates['channels'][idx])[0]
        # Land cover type
        OUTPUT['hsu']['land_cover'][hsu] = stats.mode(covariates['nlcd'][idx])[0][0]
        # Soil texture class
        OUTPUT['hsu']['soil_texture_class'][hsu] = stats.mode(covariates['TEXTURE_CLASS'][idx])[0][0]
        # Define the estimate for the model parameters
        OUTPUT['hsu']['m'][hsu] = 0.1      # form of the exponential decline in conductivity (0.01-1.0)
        OUTPUT['hsu']['pksat'][hsu] = 1.0  # saturated hydraulic conductivity scalar multiplier (0.1-1.0)
        OUTPUT['hsu']['psoil'][hsu] = 1.0  # soil hydraulic properties (residual, wilting, field capacity, and porosity) (0.1-10.0)
        OUTPUT['hsu']['sdmax'][hsu] = 5.0  # maximum effective deficit of subsurface saturated zone (0.1-10.0)
        if np.max(covariates['carea'][idx]) >= 100000.0:
            OUTPUT['hsu']['mannings'][hsu] = 0.03  # Manning's n for channel flow (0.01-0.1)
        else:
            OUTPUT['hsu']['mannings'][hsu] = 0.15  # Manning's n for overland flow (0.01-0.8)
    return OUTPUT
import numpy as np
from scipy import stats


def classify_by_ola_proba(local_accuracy, pred):
    sorted_acc = np.sort(np.unique(local_accuracy))[::-1]
    for a in sorted_acc:
        acc_indices = np.where(local_accuracy == a)[0]
        val, count = stats.mode(pred[acc_indices])
        if count[0] > acc_indices.shape[0] / 2:
            return val[0]
    print "tie"
    return stats.mode(pred)[0][0]
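# Hedged usage sketch (not from the original source): a toy overall-local-
# accuracy (OLA) vote where the classifiers tied at the highest accuracy
# level hold a strict majority, so the loop returns their modal label.
import numpy as np

local_accuracy = np.array([0.9, 0.9, 0.9, 0.6])
pred = np.array([1, 1, 2, 0])
# The three 0.9-accuracy predictions vote 1, 1, 2 -> majority label 1.
print(classify_by_ola_proba(local_accuracy, pred))  # 1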
def calculate_purity(self, vector_purity, labels, epoch, gamma=None):
    labels = labels[0:self.n_samples]
    if gamma is None:
        for i in range(self.n_clusters):
            vector_purity[epoch, i] = (mode(self.y[np.where(labels == i)])[1] /
                                       np.where(labels == i)[0].shape[0])[0]
        return vector_purity
    else:
        for i in range(self.n_clusters):
            vector_purity[epoch, i, gamma] = (mode(self.y[np.where(labels == i)])[1] /
                                              np.where(labels == i)[0].shape[0])[0]
        return vector_purity
def statresults(agdia, audia):
    # Statistical report
    dfinal = pd.DataFrame([[len(audia)], [len(agdia)]],
                          columns=['Total particles counted'],
                          index=['AuNP', 'AgNP'])
    dfinal['Mean Diameter (nm)'] = np.round([np.mean(audia), np.mean(agdia)], 1)
    dfinal['Median Diameter (nm)'] = np.round([np.median(audia), np.median(agdia)], 1)
    dfinal['Mode Diameter (nm)'] = np.round([mode(audia.tolist())[0], mode(agdia.tolist())[0]], 1)
    return dfinal
def reed_muller_decode(blocks):
    x1 = R[1]
    x2 = R[2]
    x3 = R[3]
    notx1 = (~x1 % 2)
    notx2 = (~x2 % 2)
    notx3 = (~x3 % 2)
    # Characteristic vectors and coefficients of x3
    x3_1 = np.logical_and(x1, x2) % 2
    x3_2 = np.logical_and(x1, notx2) % 2
    x3_3 = np.logical_and(notx1, x2) % 2
    x3_4 = np.logical_and(notx1, notx2) % 2
    v3 = np.array([x3_1, x3_2, x3_3, x3_4])
    all3 = np.dot(v3, blocks) % 2
    # c3 = stats.mode(all3)[0]
    (mode3, count3) = stats.mode(all3)
    c3 = ((mode3 == 1) * (count3 > all3.shape[0] / 2)) % 2
    # Characteristic vectors and coefficients of x2
    x2_1 = np.logical_and(x1, x3) % 2
    x2_2 = np.logical_and(x1, notx3) % 2
    x2_3 = np.logical_and(notx1, x3) % 2
    x2_4 = np.logical_and(notx1, notx3) % 2
    v2 = np.array([x2_1, x2_2, x2_3, x2_4])
    all2 = np.dot(v2, blocks) % 2
    # c2 = stats.mode(all2)[0]
    (mode2, count2) = stats.mode(all2)
    c2 = ((mode2 == 1) * (count2 > all2.shape[0] / 2)) % 2
    # Characteristic vectors and coefficients of x1
    x1_1 = np.logical_and(x2, x3) % 2
    x1_2 = np.logical_and(x2, notx3) % 2
    x1_3 = np.logical_and(notx2, x3) % 2
    x1_4 = np.logical_and(notx2, notx3) % 2
    v1 = np.array([x1_1, x1_2, x1_3, x1_4])
    all1 = np.dot(v1, blocks) % 2
    # c1 = stats.mode(all1)[0]
    (mode1, count1) = stats.mode(all1)
    c1 = ((mode1 == 1) * (count1 > all1.shape[0] / 2)) % 2
    # Calculate coefficient of 0th row
    coefficients = np.concatenate((c1, c2, c3), axis=0)
    dotted = np.dot(coefficients.T, np.array([x1, x2, x3])).T % 2
    all0 = (dotted + blocks) % 2
    # If more 1's, then 1. Otherwise, 0.
    # This also handles the case of an equal number of 1's and 0's -> set to 0.
    (mode0, count0) = stats.mode(all0)
    c0 = ((mode0 == 1) * (count0 > all0.shape[0] / 2)) % 2
    decoded = np.concatenate([c0, c1, c2, c3], axis=0)
    decoded = decoded.astype(int)
    return decoded
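# Hedged illustration (not from the original source): the
# (mode == 1) * (count > n/2) pattern above turns stats.mode output into a
# strict per-column majority vote, with ties resolved to 0.
import numpy as np
from scipy import stats

votes = np.array([[1, 0],
                  [1, 0],
                  [0, 1],
                  [1, 1]])
m, c = stats.mode(votes)  # column-wise mode and its count
print(((m == 1) * (c > votes.shape[0] / 2)) % 2)  # [[1 0]]: only a strict majority of 1's survives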
def _get_prf(res_set):
    res_set = np.array(res_set)
    modes = []
    precs = []
    recs = []
    for res in res_set:
        modes.append(mode(res)[0][0])
        precs.append(mode(res)[0][0] / len(res))
    for m in modes:
        m = 0
def calculate_purity(self, labels, epoch, n_iter=0, gamma=None):
    labels = labels[0:self.n_samples]
    if gamma is None:
        for i in range(self.n_clusters):
            self.purity['kmeans'][epoch, i, n_iter] = \
                (mode(self.y[np.where(labels == i)])[1] / np.where(labels == i)[0].shape[0])[0]
        return self.purity['kmeans']
    else:
        for i in range(self.n_clusters):
            self.purity['kernelkmeans'][epoch, i, gamma, n_iter] = \
                (mode(self.y[np.where(labels == i)])[1] / np.where(labels == i)[0].shape[0])[0]
        return self.purity['kernelkmeans']
def camera_spot(i, D):
    # split the distance histogram at the elbow between its linear
    # region and its exponential region
    elbow = np.argmin(D - i)
    j, Dl = index_normalize(D[elbow:])
    k, Dr = index_normalize(D[:elbow])
    # now compute the mode count for each region and divide by the size
    # of the region. if either mode count is high, that likely indicates
    # a camera spot
    mcl = 1. * stats.mode(Dl)[1][0] / Dl.size
    mcr = 1. * stats.mode(Dr)[1][0] / Dr.size
    return max(mcl, mcr)
import numpy as np
from scipy.stats import mode


def homogeneity(labels1, labels2):
    num_missed = 0.0
    for label in set(labels1):
        matches = labels2[labels1 == label]
        match_mode = mode(matches).mode[0]
        num_missed += np.sum(matches != match_mode)
    for label in set(labels2):
        matches = labels1[labels2 == label]
        match_mode = mode(matches).mode[0]
        num_missed += np.sum(matches != match_mode)
    return num_missed / 2.0
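# Hedged usage sketch (not from the original source): for each cluster in
# one labeling, count the points that disagree with that cluster's majority
# label in the other labeling, averaged over both directions.
import numpy as np

labels1 = np.array([0, 0, 0, 1, 1, 1])
labels2 = np.array([5, 5, 6, 6, 6, 6])
print(homogeneity(labels1, labels2))  # 1.0: one stray point in each direction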
def compute(self, do_pdf='yes'):
    for i in xrange(self.nD):
        self.bigpdf = zeros(len(self.zbins))
        if self.dict_zp.has_key(i):
            out = array(self.dict_zp[i]['zp'])
            # wout = array(self.dict_zp[i]['wp'])
            if self.dict_zp[i].has_key('zs'):
                self.zs[i] = self.dict_zp[i]['zs']
            if self.Pars.predictionclass == 'Reg':
                for zpi in xrange(len(out)):
                    mybin = int(floor(out[zpi] / self.resz))
                    if mybin > self.Nbins - 1:
                        continue
                    self.bigpdf[mybin] += 1.
                pdf = self.bigpdf
                pdf2 = interp(self.zfine2, self.zbins, pdf)
                pdf2 = where(greater(pdf2, max(pdf2) * 0.01), pdf2, 0.)
                pdf2 = convolve(pdf2, self.gaus2, 1)
                pdf2 = where(greater(pdf2, max(pdf2) * 0.005), pdf2, 0.)
                if sum(pdf2) > 0.:
                    pdf2 /= sum(pdf2)
                self.zs0[i] = self.zfine2[argmax(pdf2)]
                self.zs0[i] = min(self.zs0[i], self.Pars.maxz)
                self.zs0[i] = max(self.zs0[i], self.Pars.minz)
                self.zs1[i] = sum(self.zfine2 * pdf2)
                self.zs1[i] = min(self.zs1[i], self.Pars.maxz)
                self.zs1[i] = max(self.zs1[i], self.Pars.minz)
                if do_pdf == 'yes':
                    self.err0[i] = utils_mlz.compute_error(self.zfine2, pdf2, self.zs0[i])
                    self.err1[i] = utils_mlz.compute_error(self.zfine2, pdf2, self.zs1[i])
                    self.zConf0[i] = utils_mlz.compute_zConf(self.zfine2, pdf2, self.zs0[i], self.Pars.rmsfactor)
                    self.zConf1[i] = utils_mlz.compute_zConf(self.zfine2, pdf2, self.zs1[i], self.Pars.rmsfactor)
                    pdf2 = pdf2[self.wzin]
                    if sum(pdf2) > 0.:
                        pdf2 /= sum(pdf2)
                    self.bigpdf2[i, :] = pdf2
            if self.Pars.predictionclass == 'Class':
                if len(out) > 0:
                    self.zs0[i] = mode(out * 1.)[0][0]
                    self.zs1[i] = mean(out * 1.)
                if len(out) > 0.:
                    self.err0[i] = mode(out * 1.)[1][0] * 1. / (len(out) * 1.)
                    self.err1[i] = std(out * 1.)
    bigZ = zeros((self.nD, 7))
    bigZ[:, 0] = self.zs
    bigZ[:, 1] = self.zs0
    bigZ[:, 2] = self.zs1
    bigZ[:, 3] = self.zConf0
    bigZ[:, 4] = self.zConf1
    bigZ[:, 5] = self.err0
    bigZ[:, 6] = self.err1
    if do_pdf == 'no':
        return bigZ
    else:
        return bigZ, self.bigpdf2
from scipy import stats


def getSlidingWindowModes(windowSize):
    global dataMax
    assert len(dataMax) > 0
    assert windowSize <= len(dataMax)
    modes = []
    nb_windows = len(dataMax) - windowSize + 1
    datalet = dataMax[:windowSize]
    modes.append(stats.mode(datalet)[0][0])
    for i in range(1, nb_windows):
        datalet.pop(0)
        datalet.append(dataMax[windowSize + i - 1])  # next element entering the window
        modes.append(stats.mode(datalet)[0][0])
    print "modes", modes
    return modes
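# Hedged usage sketch (not from the original source): assumes this runs in
# the same module as getSlidingWindowModes, since the function reads the
# global dataMax rather than taking the series as an argument.
dataMax = [1, 1, 2, 2, 2, 3]
print(getSlidingWindowModes(3))  # [1, 2, 2, 2], one mode per length-3 window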
def predict(self, entities, flag=False):
    # if depth > 0, object is an entity list. otherwise, problem!!!!
    NA_VAL = -1
    if self.is_svm:
        transformed_obj = apply_transforms(self.relations, self.transforms, [entities])
        if flag:
            transformed_obj = apply_transforms_other(self.relations, self.transforms[-1:], [entities])
        self.table = zeros((len(transformed_obj), len(self.features)))
        for j, new_feature in enumerate(self.features):
            self.table[:, j] = array([new_feature(ent) for ent in transformed_obj])
        return int(mode(self.query_tree.predict(self.table))[0][0])
    curr_node = self.query_tree
    if curr_node.chosen_tag is None:  # edge case in the case of consistent
        return 0  # some arbitrary rule
    while curr_node.chosen_query is not None:
        if len(curr_node.sons.keys()) == 1:  # only one son
            curr_node = curr_node.sons[curr_node.sons.keys()[0]]
            continue
        transformed_obj = apply_transforms(curr_node.relations, curr_node.transforms, [entities])
        if flag:
            transformed_obj = apply_transforms_other(curr_node.relations, curr_node.transforms[-1:], [entities])
        query_val = None
        # print transformed_obj
        if len(transformed_obj[0]) == 0:  # no entities
            query_val = NA_VAL
            if len(self.transforms) > 0:
                return NA_VAL  # if not lvl0, return -1 for this
        elif not curr_node.is_rec:
            query_val = curr_node.chosen_query(transformed_obj[0])
        else:
            vals = []
            if len(self.transforms) == 0:  # need to apply the transforms
                vals = [curr_node.chosen_query([x]) for x in transformed_obj[0]
                        if len(apply_transforms(curr_node.relations, curr_node.justify.transforms, [x])[0]) > 0]
            else:
                vals = [curr_node.chosen_query([x]) for x in transformed_obj[0]
                        if len(apply_transforms_other(curr_node.relations, curr_node.justify.transforms[-1:], [x])[0]) > 0]
            if len(vals) > 0:
                query_val = int(mode(vals)[0][0])  # ISSUE: mode is a problem if counts are equal...
            else:
                query_val = NA_VAL  # query for tree is -1
        tmp = int(curr_node.chosen_tag)
        curr_node = curr_node.sons.get(query_val)
        if curr_node is None:  # tried a tree that has no N/A in train, but does in test / example was []
            return tmp  # best possible guess
    return int(curr_node.chosen_tag)
def classify(self, data):
    nan_cols = np.arange(self.n_features)[np.isnan(data)]
    decisions = []
    s1 = set(nan_cols)
    for i in range(self.n_trees):
        cols = self.col_list[i]
        s2 = set(cols)
        if len(s1.intersection(s2)) > 0:
            # decisions[i] = -1
            continue
        decisions.append(self.bags[i].predict(data[cols]))
    if len(decisions) == 0:
        return (-1, 0, 0)
    return (mode(decisions)[0][0][0], mode(decisions)[1][0][0], len(decisions))
def aggregate_VOTE(self, cv_prediction_matrix, final_prediction_matrix, y_train, y_test):
    final_accuracy_test, final_accuracy_train_cv = 0, 0
    final_accuracy_classifier_index = -1
    cv_accuracies = [0]
    test_accuracies = []
    # calculate accuracy for the first classifier
    test_accuracy_till_now = np.sum(final_prediction_matrix[0] + 1 == y_test) / float(len(y_test))
    test_accuracies.append(test_accuracy_till_now)
    for classifier_index in range(1, self.number_of_classifiers):
        cv_accuracy_till_now = 0
        cv = StratifiedKFold(y=y_train, n_folds=self.config.configuration["number_of_cv_folds"])
        cv_labels_till_now = mode(cv_prediction_matrix[0:classifier_index], axis=0)[0][0]
        for train_index, test_index in cv:
            y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
            cv_accuracy_till_now += np.sum(cv_labels_till_now[test_index] + 1 == y_test_cv) / float(len(y_test_cv))
        cv_accuracy_till_now /= float(self.config.configuration['number_of_cv_folds'])
        cv_accuracies.append(cv_accuracy_till_now)
        # print str(classifier_index) + " cv accuracy: " + str(cv_accuracy_till_now)
        test_labels_till_now = mode(final_prediction_matrix[0:classifier_index], axis=0)[0][0]
        test_accuracy_till_now = np.sum(test_labels_till_now + 1 == y_test) / float(len(y_test))
        test_accuracies.append(test_accuracy_till_now)
        # print "test predictions till now: " + str(test_accuracy_till_now)
        # termination condition
        if classifier_index > self.window_size_for_termination:
            final_accuracy_test, final_accuracy_train_cv, final_accuracy_classifier_index = \
                self.check_threshold(cv_accuracies, classifier_index, test_accuracy_till_now,
                                     final_accuracy_test, final_accuracy_train_cv,
                                     final_accuracy_classifier_index)
            # if final_accuracy_classifier_index != -1:
            #     break
    if final_accuracy_classifier_index == -1:
        final_accuracy_test = test_accuracies[self.number_of_classifiers - 1]
        final_accuracy_train_cv = cv_accuracies[self.number_of_classifiers - 1]
        final_accuracy_classifier_index = self.number_of_classifiers
    print self.bo_selection_type, self.subject, final_accuracy_test, final_accuracy_train_cv, final_accuracy_classifier_index
    with open('../n_classifiers.txt', 'a') as f:
        f.write(self.subject + self.bo_selection_type + str(final_accuracy_classifier_index) + '\n')
    return final_accuracy_test, final_accuracy_train_cv, final_accuracy_classifier_index, cv_accuracies, test_accuracies
def predict(self, X):
    """Predict the class labels for the provided data

    Parameters
    ----------
    X: array
        A 2-D array representing the test points.

    Returns
    -------
    labels: array
        List of class labels (one for each data sample).
    """
    X = atleast2d_or_csr(X)
    neigh_dist, neigh_ind = self.kneighbors(X)
    pred_labels = self._y[neigh_ind]
    weights = _get_weights(neigh_dist, self.weights)
    if weights is None:
        mode, _ = stats.mode(pred_labels, axis=1)
    else:
        mode, _ = weighted_mode(pred_labels, weights, axis=1)
    return self.classes_.take(mode.flatten().astype(np.int))
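# Hedged illustration (not from the original source): contrasts the plain
# mode with sklearn's weighted_mode, which the weighted branch above uses.
import numpy as np
from scipy import stats
from sklearn.utils.extmath import weighted_mode

labels = np.array([[0, 0, 1]])
weights = np.array([[0.2, 0.2, 1.0]])
print(stats.mode(labels, axis=1)[0])              # [[0]]  two votes beat one
print(weighted_mode(labels, weights, axis=1)[0])  # [[1.]] the heavy vote wins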
def knnClassifier(training_data, test_data, training_target, test_target, k=5):
    # normalize the data: calculate the z-score of each column
    new_training_data = stats.zscore(training_data.astype(int), axis=0)
    new_test_data = stats.zscore(test_data.astype(int), axis=0)
    # find the k nearest neighbors for each test case
    predictions = []
    for test in new_test_data:
        # find the euclidean distance between the test case and all training cases
        distances = []
        neighbors = []
        neighbor_predictions = []
        for train in new_training_data:
            distances.append(np.linalg.norm(train - test))
        # take the k smallest distances, masking each one as it is used
        for i in range(k):
            neighb_i = distances.index(min(distances))
            neighbors.append(neighb_i)
            distances[neighb_i] = 1000000
        for neighb in neighbors:
            neighbor_predictions.append(training_target[neighb])
        predictions.append(stats.mode(neighbor_predictions)[0][0])
    return predictions
def calc_modes(N2, bottom_depth, z_bins):
    """Wave velocity and structure of first three modes"""
    dz = np.mean(np.diff(z_bins))
    # Truncate N2 to appropriate length based on depth and dz
    Nz = (bottom_depth / dz).astype(int)
    N2 = N2[:Nz]
    # Find indices of start and end of finite values
    finite_vals = nan_or_masked(N2) == 0
    labels = label(finite_vals)[0]
    main_data = np.where(labels == mode(labels[finite_vals]))[1]
    start_ind, end_ind = main_data[0], main_data[-1]
    # Fill in NaN values with start or end values
    N2[:start_ind] = N2[start_ind]
    N2[end_ind + 1:] = N2[end_ind]
    # Preallocate arrays for horizontal and vertical structure
    hori = np.full((len(z_bins) - 1, 3), np.nan)
    vert = hori.copy()
    hori[:len(N2), :], vert[:len(N2), :], c, _ = vertModes(N2, dz, 3)
    return hori, vert, c[:3]
def select_downcast(pressure, for_overturn_calcs=False):
    """Find indices of the downcast part of data"""
    # Take the derivative of the pressure profile
    dp = np.diff(pressure)
    # Constants for the filter
    B, A = signal.butter(2, 0.01, output='ba')
    # Filter the pressure derivative, to smooth out the curve
    dp_smooth = signal.filtfilt(B, A, dp)
    # Make the arrays the same size
    dp_smooth = np.append(dp_smooth, [0])
    # Find the indices where the descent rate is more than 0.05
    falling_inds = dp_smooth > 0.05
    if for_overturn_calcs:
        # For overturns, we want the fall to be smooth, so exclude portions
        # near the surface where the fall rate may drop below 0.05. Without
        # this step we would end up with discontinuous pieces of the profile,
        # so keep only the largest contiguous falling segment (the modal label).
        inds_label = label(falling_inds)[0]
        inds_label_mode = mode(inds_label[falling_inds])[0]
        falling_inds[inds_label != inds_label_mode] = False
    falling_inds = np.where(falling_inds)[0]
    return falling_inds
def build_tree(data, labels, word_data, level):
    if level == 0:
        # return the label value which is dominant
        return LabelConv[st.mode(labels)[0][0] - 1]
    # select the appropriate attribute for the node:
    best, best_ig = attribute_selection(data, labels)
    best_data = data[:, best]
    best_word = word_data[best]
    # remove everything regarding that attribute from the data:
    word_data = np.delete(word_data, best, 0)
    left_data = np.delete(data[best_data == 0, :], best, 1)
    right_data = np.delete(data[best_data == 1, :], best, 1)
    # divide labels into two subarrays based on the selected attribute:
    left_labl = labels[best_data == 0]
    right_labl = labels[best_data == 1]
    if check_label(left_labl) == 2 and level != 0:
        # the label is mono-valued:
        left = LabelConv[left_labl[0] - 1]
    else:
        left = build_tree(left_data, left_labl, word_data, level - 1)
    if check_label(right_labl) == 2 and level != 0:
        # the label is mono-valued:
        right = LabelConv[right_labl[0] - 1]
    else:
        right = build_tree(right_data, right_labl, word_data, level - 1)
    subtrees = {0: left, 1: right}
    return (best_word, best_ig, subtrees)
# Add a new column to the existing DataFrame with the encoded values
df[LABEL] = le.fit_transform(df['label'].values.ravel())

RANDOM_SEED = 50
N_TIME_STEPS = 200
N_FEATURES = 2
classes = 4
step = 1
segments = []
labels = []
for i in range(1, len(df) - N_TIME_STEPS, step):
    x1 = df['value a'].values[i:i + N_TIME_STEPS]
    x2 = df['value b'].values[i:i + N_TIME_STEPS]
    label = stats.mode(df['label'][i:i + N_TIME_STEPS])[0][0]
    segments.append([x1, x2])
    labels.append(label)

reshaped_segments = np.asarray(segments, dtype=np.float32).reshape(-1, N_TIME_STEPS, N_FEATURES)
labels = np.asarray(pd.get_dummies(labels), dtype=np.float32)
X_train, X_test, y_train, y_test = train_test_split(
    reshaped_segments, labels, test_size=0.2, random_state=RANDOM_SEED)
print('x_train shape: ', X_train.shape)
print(X_train.shape[0], 'training samples')
print('y_train shape: ', y_train.shape)
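# Hedged illustration (not from the original source): each sliding window
# above is assigned the most frequent label inside the window via stats.mode,
# e.g. for one toy window of encoded labels:
from scipy import stats

window_labels = [0, 0, 1, 0]
print(stats.mode(window_labels)[0][0])  # 0, the majority label of the window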
combined_data['Item_Visibility_MeanRatio'] = combined_data.apply(
    lambda x: x['Item_Visibility'] / item_visibility_mean.loc[x['Item_Identifier']], axis=1)

# step 5
# For Year_Of_Establishment add new variable Outlet_Years and drop Year_Of_Establishment.
combined_data['Outlet_Years'] = 2018 - combined_data['Outlet_Establishment_Year']

# step 6
# Fill missing values for outlet size with the per-outlet-type mode.
outlet_size_mode = combined_data.dropna(subset=["Outlet_Size"]).pivot_table(
    values='Outlet_Size', columns='Outlet_Type', aggfunc=(lambda x: mode(x).mode[0]))
miss_bool = combined_data['Outlet_Size'].isnull()
sum(miss_bool)
combined_data.loc[miss_bool, 'Outlet_Size'] = combined_data.loc[
    miss_bool, 'Outlet_Type'].apply(lambda x: outlet_size_mode[x])
sum(combined_data['Outlet_Size'].isnull())

# step 7
# Item Type handling.
combined_data['Item_Type_Combined'] = combined_data['Item_Identifier'].apply(
    lambda x: x[0:2])
combined_data['Item_Type_Combined'] = combined_data['Item_Type_Combined'].map({
    'FD': 'Food',
    'DR': 'Drinks',
def levelPlot(data, var=None, time=None, levels=(3, 5), target=None, colors=None, **kwargs):
    """
    Draw a step-plot with up to 5 levels following a color cycle (e.g. Kp index "stoplight")

    Parameters
    ----------
    data : array-like, or dict-like
        Data for plotting. If dict-like, the key providing an array-like
        to plot must be given to var keyword argument.

    Other Parameters
    ----------------
    var    : string
        Name of key in dict-like input that contains data
    time   : array-like or string
        Name of key in dict-like that contains time, or arraylike of datetimes
    levels : array-like, up to 5 levels
        Breaks between levels in data that should be shown as distinct colors
    target : figure or axes
        Target axes or figure window
    colors : array-like
        Colors to use for the color sequence (if insufficient colors, will use as a cycle)
    **kwargs : other keywords
        Other keywords to pass to spacepy.toolbox.binHisto

    Returns
    -------
    binned : tuple
        Tuple of the binned data and bins

    Examples
    --------
    >>> import spacepy.plot as splot
    >>> import spacepy.time as spt
    >>> import spacepy.omni as om
    >>> tt = spt.tickrange('2012/09/28','2012/10/2', 3/24.)
    >>> omni = om.get_omni(tt)
    >>> splot.levelPlot(omni, var='Kp', time='UTC', colors=['seagreen', 'orange', 'crimson'])
    """
    # assume dict-like/key-access, before moving to array-like
    if var is not None:
        try:
            usearr = data[var]
        except KeyError:
            raise KeyError('Key "{0}" not present in data'.format(var))
    else:
        # var is None, so make sure we don't have a dict-like
        if not isinstance(data, Mapping):
            usearr = np.asarray(data)
        else:
            raise TypeError('Data appears to be dict-like without a key being given')
    tflag = False
    if time is not None:
        from scipy.stats import mode
        try:
            times = data[time]
        except (KeyError, ValueError, IndexError):
            times = time
        try:
            times = matplotlib.dates.date2num(times)
            tflag = True
        except AttributeError:
            # the x-data are a non-datetime
            times = np.asarray(time)
        # now add the end-point
        stepsize, dum = mode(np.diff(times), axis=None)
        times = np.hstack([times, times[-1] + stepsize])
    else:
        times = np.asarray(range(0, len(usearr) + 1))
    if not colors:
        if len(levels) <= 3:
            # traffic light colours that are distinct to protanopes and deuteranopes
            colors = ['lime', 'yellow', 'crimson', 'saddlebrown']
        else:
            colors = matplotlib.rcParams['axes.color_cycle']
    else:
        try:
            assert len(colors) > len(levels)
        except AssertionError:
            # cycle the given colors, if not enough are given
            colors = list(colors) * int(1 + len(levels) / len(colors))
    if 'alpha' not in kwargs:
        kwargs['alpha'] = 0.75
    if 'legend' not in kwargs:
        legend = False
    else:
        legend = kwargs['legend']
        del kwargs['legend']
    fig, ax = set_target(target)
    subset = np.asarray(dmcopy(usearr))

    def fill_between_steps(ax, x, y1, **kwargs):
        y2 = np.zeros_like(y1)
        stepsxx = x.repeat(2)[1:-1]
        stepsyy = y1.repeat(2)
        y2 = np.zeros_like(stepsyy)
        ax.fill_between(stepsxx, stepsyy, y2, **kwargs)
        if mpl.__version__ < '1.5.0':
            # pre-v1.5.0, need to manually add an artist for the legend
            p = plt.Rectangle((0, 0), 0, 0, **kwargs)
            ax.add_patch(p)

    # below threshold 1
    idx = 0
    inds = usearr > levels[0]
    subset[inds] = np.nan
    kwargs['label'] = u'≤{0}'.format(levels[idx])
    fill_between_steps(ax, times, subset, color=colors[0], zorder=30, **kwargs)
    # for each of the "between" thresholds
    for idx in range(1, len(levels)):
        subset = np.asarray(dmcopy(usearr))
        inds = np.bitwise_or(usearr <= levels[idx - 1], usearr > levels[idx])
        subset[inds] = np.nan
        kwargs['label'] = u'>{0},≤{1}'.format(levels[idx - 1], levels[idx])
        fill_between_steps(ax, times, subset, color=colors[idx],
                           zorder=30 - (idx * 2), **kwargs)
    # last
    idx += 1
    try:
        inds = usearr <= levels[idx - 1]
        subset = np.asarray(dmcopy(usearr))
        subset[inds] = np.nan
        kwargs['label'] = '>{0}'.format(levels[-1])
        fill_between_steps(ax, times, subset, color=colors[idx],
                           zorder=30 - (idx * 2), **kwargs)
    except:
        pass
    # if required, set x axis to times
    if tflag:
        try:
            applySmartTimeTicks(ax, data[time])
        except (IndexError, KeyError):
            # using data array to index, so should just use time
            applySmartTimeTicks(ax, time)
    ax.grid(False, which='minor')  # minor grid usually looks bad on these...
    if legend:
        ncols = len(levels) + 1
        if ncols > 3:
            ncols = ncols // 2
        ax.legend(loc='upper left', ncol=ncols)
    return ax
def OnevsOne(dataset, temp_indices, train_dataset, test_dataset, learning_parameter, num_classes, weights):
    y = (dataset[:, 7]).astype(int)
    new_models = [[] for i in range(num_classes)]
    X_train = train_dataset[:, :7] / np.max(train_dataset[:, :7], 0)
    y_train = train_dataset[:, 7].astype(int)
    X_test = test_dataset[:, :7] / np.max(test_dataset[:, :7], 0)
    y_actual = test_dataset[:, 7].astype(int)
    labels = np.unique(y).astype('str')
    new_models = [[] for i in range(int(num_classes * (num_classes - 1) / 2))]
    binary_class_models = [[] for i in range(int(num_classes * (num_classes - 1) / 2))]
    binary_class_labels = [[] for i in range(int(num_classes * (num_classes - 1) / 2))]
    for i in range(len(new_models)):
        new_models[i] = np.where(y == i + 1, 1, 0)
    i = 0
    for p in range(1, num_classes):
        for q in range(p):
            binary_class_labels[i] = labels[q] + labels[p]
            binary_class_models[i] = np.vstack((new_models[q], new_models[p]))
            i += 1
    binary_class_models = [model[1] for model in binary_class_models]
    new_models = binary_class_models
    predictions = [[] for i in range(num_classes)]
    probabilities = [[] for i in range(num_classes)]
    class_predictions = [[] for i in range(num_classes)]
    y_pred = []
    for pred in range(len(predictions)):
        y_test = new_models[pred][temp_indices]
        y_train = np.delete(new_models[pred], temp_indices, axis=0)
        # print("Binary Class: ", binary_class_labels[pred])
        weights = np.random.rand(7)
        for i in range(5000):
            weights = update(X_train, weights, y_train, learning_parameter)
        class_labels = list(map(int, binary_class_labels[pred]))
        probabilities[pred] = sigmoid(np.sum(np.multiply(X_test, weights), axis=1))
        predictions[pred] = np.heaviside((probabilities[pred] - 0.5), 0).astype(int)
        # print("Accuracy for Class", binary_class_labels[pred], ": ", accuracy_score(y_test, predictions[pred]), "\n")
        class_predictions[pred] = [class_labels[label] for label in predictions[pred]]
    y_pred = stats.mode(class_predictions)[0][-1]
    print("Individual Accuracy: ", accuracy_score(y_actual, y_pred), '\n')
    # print(confusion_matrix(y_actual, y_pred))
    acc_score = accuracy_score(y_actual, y_pred) * len(temp_indices)
    return weights, acc_score
def calc_staff_font_info(df_0):
    needs_postprocessing = False
    for index_0 in range(len(df_0)):
        height, width = int(df_0[index_0]['height'].max()), int(df_0[index_0]['width'].max())
        container_font_info = {
            'pixel_mean': [],
            'delta_line': [],
            'pass_count': [],
            'kind': []
        }
        for index_1 in range(df_0[index_0].shape[0]):
            info = df_0[index_0].iloc[index_1:, :].copy()
            template = img[info['y'].values[0]:info['y'].values[0] + height,
                           info['x'].values[0]:info['x'].values[0] + width]
            # kernel dimensions must be odd
            if width % 2 != 1:
                width = width - 1
            if height % 2 != 1:
                height = height - 1
            template_blr = cv2.GaussianBlur(template, (width, 1), 0)
            th, template_th = cv2.threshold(template_blr,
                                            int((np.mean(template_blr)) * 0.75),
                                            255, cv2.THRESH_BINARY_INV)
            for index_2 in range(2):
                template_open = cv2.morphologyEx(template_th, cv2.MORPH_OPEN,
                                                 np.ones((1, 75), np.uint8))
                template_close = cv2.morphologyEx(template_open, cv2.MORPH_CLOSE,
                                                  np.ones((5, 1), np.uint8))
                template_th = template_close
            df_1 = pd.DataFrame(data={
                'row_0': template_close[:, 0].copy(),
                'row_1': template_close[:, -1].copy()
            })
            df_1 = df_1.divide(2)
            df_1['sum'] = df_1['row_0'].add(df_1['row_1'])
            df_1 = df_1.loc[df_1['sum'] > 200]
            df_1['numrow'] = df_1.index.tolist()
            df_1['delta_p'] = df_1['numrow'].diff().shift(-1).fillna(2)
            df_1 = df_1.loc[df_1['delta_p'] > 5]
            df_1 = df_1.reset_index(drop=True)
            for index_2 in range(df_1.shape[0]):
                if df_1['delta_p'].min() / df_1['delta_p'].mean() < 0.66:
                    hold_value, hold_index = df_1['delta_p'].min(), df_1['delta_p'].idxmin()
                    if hold_index != 0 and hold_index != df_1.index.tolist()[-1]:
                        val_0, val_1 = (df_1['delta_p'].values[(hold_index - 1)],
                                        df_1['delta_p'].values[(hold_index + 1)])
                        if val_0 < val_1:
                            hold_index = hold_index - 1
                        else:
                            hold_index = hold_index + 1
                    elif hold_index == 0:
                        hold_index = 1
                    elif hold_index == df_1.index.tolist()[-1]:
                        hold_index = hold_index - 1
                    if np.abs(hold_value + df_1['delta_p'].values[hold_index] - df_1['delta_p'].max()) < 6:
                        df_1.loc[df_1['numrow'] == df_1['numrow'].values[hold_index], 'delta_p'] = \
                            df_1['delta_p'].values[hold_index] + hold_value
                    df_1 = df_1.loc[df_1['delta_p'] > df_1['delta_p'].min()]
                    df_1 = df_1.reset_index(drop=True)
                else:
                    break
            df_1['delta_line'] = df_1['delta_p'].tolist()
            template_a = img[info['y'].values[0]:int(info['y'].values[0] + template.shape[0]),
                             info['x'].values[0]:int(info['x'].values[0] + info['width'].values[0])]
            template_blr_a = cv2.GaussianBlur(template_a, (9, 1), 0)
            th, template_th_a = cv2.threshold(
                template_blr_a,
                int((np.min(template_a) + (255 - np.mean(template_blr_a))) * 1.3),
                255, cv2.THRESH_BINARY_INV)
            container_font_info['pixel_mean'].append(np.mean(template_th_a))
            if np.abs(df_1['delta_line'].mean() - df_1['delta_line'].mode()[0]) > 2.5:
                container_font_info['delta_line'].append(df_1['delta_line'].tolist())
                needs_postprocessing = True
            else:
                container_font_info['delta_line'].append(int(df_1['delta_line'].mean()))
        df_0[index_0]['delta_line'] = container_font_info['delta_line']
        df_0[index_0]['pixel_mean'] = container_font_info['pixel_mean']
    container_postprocessing_index = []
    if needs_postprocessing == True:
        for index_0 in range(len(df_0)):
            df_2 = df_0[index_0].copy()
            df_2 = df_2.reset_index(drop=True)
            for index, row in df_2.iterrows():
                if type(row['delta_line']) == list:
                    container_postprocessing_index.append([row['pass_count'], index, index_0, []])
    for data in container_postprocessing_index:
        df_3 = df_0[data[2]].loc[df_0[data[2]]['pass_count'] == data[0]].copy()
        for df_temp in df_0:
            df_4 = df_temp.loc[df_temp['pass_count'] == data[0]].copy()
            for val in df_4['delta_line'].values:
                if type(val) != list:
                    data[3].append(val)
        if len(data[3]) > 0:
            df_0[data[2]]['delta_line'].values[data[1]] = np.mean(data[3])
        else:
            df_0[data[2]]['delta_line'].values[data[1]] = -1
    mean = 0
    for index_0 in range(len(df_0)):
        mean = (mean + df_0[index_0]['delta_line'].mean())
    mean = mean / len(df_0)
    for index_0 in range(len(df_0)):
        df_0[index_0].loc[(df_0[index_0]['delta_line'] < 0), 'delta_line'] = mean
    container_delta_line = np.array([], dtype=np.uint8)
    for df_temp in df_0:
        container_delta_line = np.append(container_delta_line, df_temp['delta_line'].values)
    if (container_delta_line.max() != container_delta_line.min()
            and container_delta_line.max() - container_delta_line.min() < 6):
        for df_temp in df_0:
            df_temp['delta_line'] = [stats.mode(container_delta_line)[0][0]] * df_temp.shape[0]
    for df_temp in df_0:
        df_temp['font_scaling'] = df_temp['delta_line'].divide(container_delta_line.mean())
    return df_0
def __init__(self, labelimg_list, brainimg, bounding_boxes=None, beta=-.2, mixing_ratio=10,
             patch_length=5, same_threshold=True, thresholds=[0.8, 0.6]):
    # use same defaults as the parser
    def positive_int(x):
        # avoid nonsense negative parameter values
        x = int(x)
        if x < 0:
            raise AssertionError("%r is not a positive int" % (x,))
        return x

    def restricted_float(x):
        # avoid nonsense values for the threshold
        x = float(x)
        if x < 0.0 or x > 1.0:
            raise AssertionError("%r not in range [0.0, 1.0]" % (x,))
        return x

    # catch invalid parameters
    self.beta = float(beta)
    self.mixing_ratio = positive_int(mixing_ratio)
    self.patch_length = positive_int(patch_length)
    for threshold in thresholds:
        threshold = restricted_float(threshold)

    self.bounding_box = []  # get the final bounding box for AWoL-MRF
    self.bounding_box.append(np.amin(bounding_boxes[:, :3], axis=0) - self.patch_length)  # min indices
    self.bounding_box.append(np.amax(bounding_boxes[:, :3] + bounding_boxes[:, 3:], axis=0) + self.patch_length)
    self.bounding_box.append(self.bounding_box[1] - self.bounding_box[0])  # dimensions

    # get the bounded label arrays with bounding boxes
    volhandles = []
    nimg = len(labelimg_list)
    for n, img in enumerate(labelimg_list):
        xmin = bounding_boxes[n][0] - self.bounding_box[0][0]  # find the bounded indices for this image
        ymin = bounding_boxes[n][1] - self.bounding_box[0][1]
        zmin = bounding_boxes[n][2] - self.bounding_box[0][2]
        xmax = xmin + bounding_boxes[n][3]
        ymax = ymin + bounding_boxes[n][4]
        zmax = zmin + bounding_boxes[n][5]
        # get the bounded label array
        label_array = np.zeros((self.bounding_box[2][2], self.bounding_box[2][1], self.bounding_box[2][0]))
        label_array[zmin:zmax, ymin:ymax, xmin:xmax] = sitk.GetArrayFromImage(img)
        if len(volhandles) == 0:
            self.label_values = np.unique(label_array)  # obtain the list of labels
        elif np.asarray(np.unique(label_array) != self.label_values).any():
            # each image should have the same labels
            raise AssertionError("Labels in {0} not the same as in {1}.".format(img, labelimg_list[0]))
        volhandles.append(label_array)

    if len(self.label_values) != len(thresholds):
        if not same_threshold:
            raise AssertionError("Number of labels does not match number of thresholds.")
        else:
            while len(thresholds) < len(self.label_values):
                thresholds.append(thresholds[-1])  # same threshold for each structural label

    self.mode = stats.mode(volhandles)  # find the majority votes
    self.labels = np.zeros(volhandles[0].shape) - 1  # array of labels, -1 is for low-confidence voxels
    self.intensity = sitk.GetArrayFromImage(brainimg[self.bounding_box[0][0]:self.bounding_box[1][0],
                                                     self.bounding_box[0][1]:self.bounding_box[1][1],
                                                     self.bounding_box[0][2]:self.bounding_box[1][2]])

    # find the high-confidence voxels for each label
    for i, l in enumerate(self.label_values.tolist()):
        above_threshold = np.where((self.mode[0][0] == l) & (self.mode[1][0] >= thresholds[i] * nimg))
        below_threshold = np.where((self.mode[0][0] == l) & (self.mode[1][0] < thresholds[i] * nimg))
        print(above_threshold[0].shape, below_threshold[0].shape)
        # threshold reduction if necessary
        while below_threshold[0].shape > above_threshold[0].shape and thresholds[i] > .55:
            thresholds[i] -= .05
            above_threshold = np.where((self.mode[0][0] == l) & (self.mode[1][0] >= thresholds[i] * nimg))
            below_threshold = np.where((self.mode[0][0] == l) & (self.mode[1][0] < thresholds[i] * nimg))
        self.labels[above_threshold] = l
    self.brainimg = brainimg  # keep this to copy the metadata to the output image
def identity(self):
    """Return the average predicted identity of all Tracklet detections."""
    try:
        return mode(self.data[..., 3], axis=None, nan_policy='omit')[0][0]
    except IndexError:
        return -1
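# Hedged illustration (not from the original source): nan_policy='omit'
# makes stats.mode ignore NaN entries, which the method above relies on.
import numpy as np
from scipy.stats import mode

ids = np.array([2.0, 2.0, np.nan, 3.0])
print(mode(ids, axis=None, nan_policy='omit')[0][0])  # 2.0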
def computePercentOfChangeDistributionForAllNamads(OutputDir="Distiribution",
                                                   InputFile="AllNamadsByNamads.pkl"):
    if not os.path.exists(OutputDir):
        os.makedirs(OutputDir)
    f = open(InputFile, "rb")
    allData = pickle.load(f)
    f.close()
    print('start writing results for ' + str(allData.__len__()) + ' Namad')
    GroupByNamad = {}
    for Namad in allData:
        NamadData = allData[Namad]
        if Namad not in GroupByNamad:
            GroupByNamad[Namad] = []
        GroupByMonth = {}
        for val in NamadData:
            DayData = NamadData[val]
            day = DayData['تاريخ']
            key = f'{day.year:02}' + '-' + f'{day.month:02}'
            if key not in GroupByMonth:
                GroupByMonth[key] = []
            try:
                Name = DayData['نام']
            except KeyError:
                Name = Namad
            Maximum = DayData['بیشترین']
            Minimum = DayData['کمترین']
            ExchangeCount = DayData['دفعات معامله']
            # the closing price is the weighted average of the prices traded that day
            ClosePrice = DayData['مقدار قیمت پایانی']
            PercentOfClosePrice = DayData['درصد قیمت پایانی']  # average share price over the day
            # the last price is the most recent price traded up to that moment
            LastPrice = DayData['مقدار آخرین قیمت']
            PercentOfLastPrice = DayData['درصد آخرین قیمت']  # last traded price
            ValueOfBazzar = DayData['ارزش بازار']  # total market value of the symbol's shares
            GroupByMonth[key].append(float(PercentOfClosePrice))
        HistOfMonth = {}
        for m in sorted(GroupByMonth.keys()):
            a = np.asarray(GroupByMonth[m])
            hist, bin_edges = np.histogram(a, density=True)
            average = np.average(a)
            median = np.median(a)
            mode = stats.mode(a)
            std = np.std(a)
            HistOfMonth[m] = {
                'hist': hist,
                'bin': bin_edges,
                'avg': average,
                'median': median,
                'mode': mode,
                'std': std
            }
        GroupByNamad[Namad] = HistOfMonth
    f = open(OutputDir + '/PercentOfChangeDistributionForAllNamads.pkl', "wb")
    pickle.dump(GroupByNamad, f)
    f.close()
def computePercentOfChangeDistributionForAllNamadsAsWhole(OutputDir="Distiribution",
                                                          InputFile="AllData.pkl"):
    if not os.path.exists(OutputDir):
        os.makedirs(OutputDir)
    f = open(InputFile, "rb")
    allData = pickle.load(f)
    f.close()
    print('start writing results for ' + str(allData.__len__()) + ' day')
    GroupByMonth = {}
    for day in allData:
        DayData = allData[day]
        key = f'{day.year:02}' + '-' + f'{day.month:02}'
        if key not in GroupByMonth:
            GroupByMonth[key] = []
        for Namad in DayData:
            NamadData = DayData[Namad]
            try:
                Name = NamadData['نام']
            except KeyError:
                Name = day
            Maximum = NamadData['بیشترین']
            Minimum = NamadData['کمترین']
            ExchangeCount = NamadData['دفعات معامله']
            # the closing price is the weighted average of the prices traded that day
            ClosePrice = NamadData['مقدار قیمت پایانی']
            PercentOfClosePrice = NamadData['درصد قیمت پایانی']  # average share price over the day
            # the last price is the most recent price traded up to that moment
            LastPrice = NamadData['مقدار آخرین قیمت']
            PercentOfLastPrice = NamadData['درصد آخرین قیمت']  # last traded price
            ValueOfBazzar = NamadData['ارزش بازار']  # total market value of the symbol's shares
            GroupByMonth[key].append(float(PercentOfClosePrice))
    HistOfMonth = {}
    for m in sorted(GroupByMonth.keys()):
        a = np.asarray(GroupByMonth[m])
        hist, bin_edges = np.histogram(a, density=True)
        HistOfMonth[m] = {
            'hist': hist,
            'bin': bin_edges,
            'avg': np.average(a),
            'median': np.median(a),
            'mode': stats.mode(a)
        }
    f = open(OutputDir + '/PercentOfChangeDistributionForAllNamadsAsWhole.pkl', "wb")
    pickle.dump(HistOfMonth, f)
    f.close()
def dmg_seed_50_1D(colnum):
    # INITIALIZING STUFF
    Nmitral = 50
    Ngranule = np.copy(Nmitral)  # number of granule cells, pg. 383 of Li/Hop
    Ndim = Nmitral + Ngranule    # total number of cells
    # t_inh = 25   # time when inhalation starts
    # t_exh = 205  # time when exhalation starts
    Ndamage = 3  # steps to reduce entire matrix to zero
    Ncols = int(Nmitral / 2)  # define number of columns to damage
    finalt = 395  # end time of the cycle

    P_odor0 = np.zeros((Nmitral, 1))  # odor pattern, no odor
    P_odor1 = P_odor0 + .00429        # odor pattern 1

    H0 = np.load('H0_50_53Hz.npy')  # weight matrix: to mitral from granule
    W0 = np.load('W0_50_53Hz.npy')  # weights: to granule from mitral

    M = 5  # average over 5 trials for each level of damage

    # initialize iterative variables
    d1it, d2it, d3it, d4it = np.zeros(M), np.zeros(M), np.zeros(M), np.zeros(M)
    IPRit, IPR2it, pnit = np.zeros(M), np.zeros(M), np.zeros(M)
    frequencyit = np.zeros(M)
    pwrit = np.zeros(M)
    yout2, Sh2 = np.zeros((finalt, Ndim)), np.zeros((finalt, Ndim))
    psi = np.copy(Sh2[:, :Nmitral])

    # initialize quantities to be returned at the end of the process
    dmgpct1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    eigfreq1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    d11 = np.zeros(Ncols * (Ndamage - 1) + 1)
    d21 = np.zeros(Ncols * (Ndamage - 1) + 1)
    d31 = np.zeros(Ncols * (Ndamage - 1) + 1)
    d41 = np.zeros(Ncols * (Ndamage - 1) + 1)
    pwr1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    IPR1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    IPR2 = np.zeros(Ncols * (Ndamage - 1) + 1)
    pn1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    freq1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    cell_act = np.zeros((finalt, Ndim, Ncols * (Ndamage - 1) + 1))

    damage = 0
    dam = np.ones(Nmitral)

    # Get the base response first
    Omean1, Oosci1 = np.zeros((Nmitral, M)) + 0j, np.zeros((Nmitral, M)) + 0j
    Omeanbar1, Ooscibar1 = np.zeros(M) + 0j, np.zeros(M) + 0j
    for m in np.arange(M):
        yout, y0out, Sh, t, OsciAmp1, Omean1[:, m], Oosci1[:, m], Omeanbar1[m], \
            Ooscibar1[m], freq0, maxlam = olf_bulb_10(Nmitral, H0, W0, P_odor1, dam)

    counter = 0  # to get the right index for each of the measures
    damage = 0
    dam[colnum] += .5  # so that the first run is for zero damage
    for col in range(Ncols):
        cols = int(np.mod(colnum + col, Nmitral))
        for lv in np.arange(Ndamage):
            # reinitialize all iterative variables to zero (really only needed
            # for the distance measures, but a good habit)
            d1it, d2it, d3it, d4it = np.zeros(M), np.zeros(M), np.zeros(M), np.zeros(M)
            IPRit, IPR2it, pnit = np.zeros(M), np.zeros(M), np.zeros(M)
            frequencyit = np.zeros(M)
            pwrit = np.zeros(M)
            if not (lv == 0 and cols != colnum):
                # if it's the 0th level for any but the original col, skip
                dam[cols] = dam[cols] - .5
                dam[dam < 1e-10] = 0
                damage = np.sum(1 - dam)
                for m in np.arange(M):
                    # Then get the response of the damaged network
                    yout2[:, :], y0out2, Sh2[:, :], t2, OsciAmp2, Omean2, Oosci2, Omeanbar2, \
                        Ooscibar2, freq2, grow_eigs2 = olf_bulb_10(Nmitral, H0, W0, P_odor1, dam)
                    # calculate distance measures
                    print(time.time() - tm1)
                    for i in np.arange(M):
                        d1it[m] += 1 - Omean1[:, i].dot(Omean2) / (lin.norm(Omean1[:, i]) * lin.norm(Omean2))
                        d2it[m] += 1 - lin.norm(Oosci1[:, i].dot(np.conjugate(Oosci2))) / \
                            (lin.norm(Oosci1[:, i]) * lin.norm(Oosci2))
                        d3it[m] += (Omeanbar1[i] - Omeanbar2) / (Omeanbar1[i] + Omeanbar2)
                        d4it[m] += np.real((Ooscibar1[i] - Ooscibar2) / (Ooscibar1[i] + Ooscibar2))
                    d1it[m] = d1it[m] / M  # average over comparison with all control trials
                    d2it[m] = d2it[m] / M
                    d3it[m] = d3it[m] / M
                    d4it[m] = d4it[m] / M
                    # calculate spectral density and "wave function" to get average power and IPR
                    P_den = np.zeros((501, Nmitral))
                    # only calculate the spectral density from t=125 to t=250,
                    # during the main oscillations
                    for i in np.arange(Nmitral):
                        f, P_den[:, i] = signal.periodogram(Sh2[125:250, i], nfft=1000, fs=1000)
                    psi = np.zeros(Nmitral)
                    for p in np.arange(Nmitral):
                        psi[p] = np.sum(P_den[:, p])
                    psi = psi / np.sqrt(np.sum(psi**2))
                    psi2 = np.copy(OsciAmp2)
                    psi2 = psi2 / np.sqrt(np.sum(psi2**2))
                    maxAmp = np.max(OsciAmp2)
                    pnit[m] = len(OsciAmp2[OsciAmp2 > maxAmp / 2])
                    IPRit[m] = 1 / np.sum(psi**4)
                    IPR2it[m] = 1 / np.sum(psi2**4)
                    pwrit[m] = np.sum(P_den) / Nmitral
                    # get the frequency according to the adiabatic analysis
                    maxargs = np.argmax(P_den, axis=0)
                    argf = stats.mode(maxargs[maxargs != 0])
                    frequencyit[m] = f[argf[0][0]]
                # Get the returned variables for each level of damage.
                # These are 1D arrays, indexed as: col 0 damage levels
                # 0,1,...,Ndamage-1, then col 1 damage levels 0,1,..., etc.
                dmgpct1[counter] = damage / Nmitral
                IPR1[counter] = np.average(IPRit)
                pwr1[counter] = np.average(pwrit)
                freq1[counter] = np.average(frequencyit)
                IPR2[counter] = np.average(IPR2it)
                pn1[counter] = np.average(pnit)
                d11[counter] = np.average(d1it)
                d21[counter] = np.average(d2it)
                d31[counter] = np.average(d3it)
                d41[counter] = np.average(d4it)
                eigfreq1[counter] = np.copy(freq2)
                if colnum == 0 or colnum == int(Nmitral / 2):
                    cell_act[:, :, counter] = np.copy(yout2)
                counter += 1
    return dmgpct1, eigfreq1, d11, d21, d31, d41, pwr1, IPR1, IPR2, pn1, freq1, cell_act
import math
import statistics

import numpy
from scipy import stats

nnArr = []

# Disregard null values
for p in dataset.Price:
    if not math.isnan(p):
        nnArr.append(int(p))

# Calculate mean
mean = numpy.mean(nnArr)
# Calculate median
median = numpy.median(nnArr)
# Calculate mode
mode = stats.mode(nnArr, axis=None)

# ----------------------------------
# Variability - Range
statrange = numpy.ptp(nnArr)
# Variability - Interquartile Range
q3, q1 = numpy.percentile(nnArr, [75, 25])
iqr = q3 - q1
# Variability - Variance
variance = statistics.variance(nnArr)
# Variability - Standard Deviation
stddeviation = numpy.std(nnArr, ddof=1)

# Print values
print("Mean: {}".format(mean))
# 1st Solution import numpy as np from scipy import stats size = int(input()) numbers = list(map(int, input().split())) print(np.mean(numbers)) print(np.median(numbers)) print(int(stats.mode(numbers)[0])) # 2nd Solution import operator input() n = list(map(int, input().split())) w = list(map(int, input().split())) print('{0:.1f}'.format(sum(map(operator.mul, n, w))/sum(w)))
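# Alternative sketch (not part of the original solutions): the same three
# statistics with only the standard-library `statistics` module, avoiding the
# numpy/scipy dependency. Input format is assumed identical; output formatting
# may differ slightly from the numpy version.
import statistics
_ = input()  # discard the count
nums = list(map(int, input().split()))
print(round(statistics.mean(nums), 1))
print(statistics.median(nums))
print(min(statistics.multimode(nums)))  # smallest modal value, matching scipy's tie-breaking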
                 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
                 'poolcnt', 'pooltypeid7', 'propertycountylandusecode',
                 'propertylandusetypeid', 'propertyzoningdesc', 'rawcensustractandblock',
                 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip',
                 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yearbuilt',
                 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
                 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount',
                 'censustractandblock']

n_rows = len(properties1)  # avoids hard-coding the row count (2985217)
for column in column_names2:
    print("% of null values", column, " = ",
          (pd.isnull(properties1[column]).sum() / n_rows) * 100)

df = train.merge(properties1, how='left', on='parcelid')

from scipy.stats import mode

# Impute the most frequent value into each of these columns (replaces ten
# copy-pasted fillna lines in the original; behavior is unchanged)
mode_imputed_cols = ['buildingqualitytypeid', 'calculatedbathnbr',
                     'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
                     'fullbathcnt', 'garagecarcnt', 'garagetotalsqft',
                     'heatingorsystemtypeid', 'lotsizesquarefeet', 'regionidcity']
for col in mode_imputed_cols:
    df[col].fillna(mode(df[col]).mode[0], inplace=True)

# Categorical columns imputed with their most common codes
df['propertycountylandusecode'].fillna('0100', inplace=True)
df['propertyzoningdesc'].fillna('LAR3', inplace=True)

# Too sparse to be useful: drop it
df.drop(['regionidneighborhood'], axis=1, inplace=True)
column_names2.remove('regionidneighborhood')
def get_statistics(atoms, SelectedHingeResidues, filename='Output'):
    """Collect statistical data on the predicted hinges and write it to a file.

    Notes:
        * Function level: 1 (1 being top)
        * `outputfile` (the open output file handle), `mode` (scipy.stats.mode),
          `numpy` and `permutation_test` (mlxtend) are assumed to be
          module-level names.

    Args:
        atoms ([packman.molecule.Atom])                   : Set of atoms. (Read parent method description)
        SelectedHingeResidues ([packman.molecule.Residue]): Predicted hinge residues.
        filename (str, optional)                          : Output file name. Defaults to 'Output'.

    Returns:
        [p-value, stats] (float): p-value of the predicted hinge and the hinge
        statistics (in that order).
    """
    hinge_atoms = [i.get_backbone() for i in SelectedHingeResidues]
    hinge_atoms = [item for sublist in hinge_atoms for item in sublist]
    non_hinge_atoms = list(set([i for i in atoms]) - set(hinge_atoms))

    all_atoms_bfactor = [i.get_bfactor() for i in atoms]
    hinge_atoms_bfactor = [i.get_bfactor() for i in hinge_atoms]
    non_hinge_atoms_bfactor = [i.get_bfactor() for i in non_hinge_atoms]

    return_stats = []
    outputfile.write('\nSTATISTICS\n\t\tN\tMin\tMax\tMean\tMode\tMedian\tSTDDev\n')
    return_stats.append(['', 'N', 'Min', 'Max', 'Mean', 'Mode', 'Median', 'STDDev'])

    # One row of descriptive statistics per atom group (replaces three
    # copy-pasted write/append blocks in the original; same values, tab-joined)
    for label, bfactors in (('Total', all_atoms_bfactor),
                            ('Hinge', hinge_atoms_bfactor),
                            ('NonHinge', non_hinge_atoms_bfactor)):
        row = [label, len(bfactors), numpy.min(bfactors), numpy.max(bfactors),
               numpy.mean(bfactors), mode(bfactors)[0][0],
               numpy.median(bfactors), numpy.std(bfactors)]
        outputfile.write('\t'.join(str(v) for v in row) + '\n')
        return_stats.append(row)

    p_value = permutation_test(hinge_atoms_bfactor, non_hinge_atoms_bfactor,
                               method='approximate', num_rounds=10000, seed=0)
    outputfile.write('\np-value:\t' + str(p_value) + '\n')
    return p_value, return_stats
def sum_org_images_of_type(self, messages, has_filters, type_name, dir_name, master_file_name=None): """Generates a summary for the images. Args: messages: List where the messages are added. has_filters: Indicates if the directories are organized by filters. type_name: Name of the type of image analyzed. dir_name: Directory to walk to search for images. master_file_name: Name of the master file, if any. """ messages.append(["> Summary for %s files." % (type_name)]) subdirectories, files, directories_from_root = \ self.walk_directories(self._target_dir, "*", dir_name, True) # Number of directories with data (from root). number_of_directories = len(directories_from_root) if has_filters: # Store a list of unique filters. Take all the paths and split them # to get the filter component, and add all to a set, that is converted # to a list. filters = list(set([s.split(os.sep)[-1] for s in subdirectories])) messages.append([ "Number of filters with %s files is: %d" % (type_name, len(filters)) ]) messages.append( ["Filters used by %s: %s" % (type_name, str(filters))]) # Get the list of directories found containing files. unique_paths = set([ff[PATH_COL] for ff in files]) # Summary: Number of unique directories. messages.append( ["Number of %s directories: %d" % (type_name, len(unique_paths))]) # The number of files in each directory is stored here. num_files_by_dir = [] # Number of mater files found in each directory, it is calculated only # num_master = 0 # Apply the following statistics if these files are used to create a mater # file (i.e. bias or flats). if master_file_name is not None: # Summary: Number of master files created. master = [fb for fb in files if fb[FILE_NAME_COL] == \ master_file_name] num_master = len(master) messages.append( ["Number of master %s: %d" % (type_name, num_master)]) # Number of directories with files and without master Important!). dir_without_master = [] for ubp in unique_paths: # Get the files of each directory. files_of_dir = [bf for bf in files if bf[PATH_COL] == ubp] # Get the master file of this directory if any. master_file = [bf for bf in files_of_dir \ if bf[FILE_NAME_COL] == master_file_name] # If this directory has not master. if len(master_file) == 0: dir_without_master.extend([ubp]) # The number of files in this directory is the total number # of files minus the master fits found. num_of_files = len(files_of_dir) - len(master_file) num_files_by_dir.extend([num_of_files]) messages.append([ "Directory: '%s' Number of files: %d" % (ubp, num_of_files) ]) # Summary: Number of directories with files and no master. messages.append([ "Number of directories with %s and no master %s: %d" % (type_name, type_name, len(dir_without_master)) ]) # If any directory has no master, show its path. if len(dir_without_master) > 0: messages.append([ "Directories without master %s : %s" % (type_name, str(dir_without_master)) ]) else: # Count the number of files in each directory. Now taking into # account names of files and its number instead of master files. for ubp in unique_paths: # Objects in the directory whose path matched those of unique set. all_objects_of_dir = [ f[FILE_NAME_COL] for f in files \ if f[PATH_COL] == ubp ] # Take as objects names those of FIT images, not final, and only # the part name that identifies the object. objects_of_dir = [ o[:o.find(DATANAME_CHAR_SEP)] for o in all_objects_of_dir \ if o.find("." 
+ FIT_FILE_EXT) > 0 and o.find(DATA_FINAL_SUFFIX) < 0 ] # The number of files in this directory is the total number # of files minus the master fits found. num_files_by_dir.extend([len(objects_of_dir)]) messages.append([ "Directory: '%s' Number of files: %d" % (ubp, len(objects_of_dir)) ]) unique_objects_of_dir = set(objects_of_dir) for uo in unique_objects_of_dir: num_objs = len([o for o in objects_of_dir if o == uo]) messages.append( ["Object: '%s' Number of files: %d" % (uo, num_objs)]) # Create a set containing the root directories that contains files. # The source set contains a directory for each filter, so it may # contain several directories for each root directory. unique_root_dir_with_files = set( [x.split(os.sep)[1] for x in unique_paths]) # Summary: Number of directories without files. # The total number of minus the number of directories without files. num_dir_without_files = \ number_of_directories - len(unique_root_dir_with_files) messages.append([ "Number of directories without %s files: %d" % (type_name, num_dir_without_files) ]) # Summary: Number of files. num_files = sum(num_files_by_dir) messages.append(["Number of %s files is: %d" % (type_name, num_files)]) if len(num_files_by_dir) > 0: max_files_by_dir = max(num_files_by_dir) min_files_by_dir = min(num_files_by_dir) avg_files_by_dir = sum(num_files_by_dir) / len(num_files_by_dir) std_files_by_dir = np.std(num_files_by_dir) med_files_by_dir = np.median(num_files_by_dir) mode_files_by_dir = mode(num_files_by_dir)[0][0] else: max_files_by_dir = 0 min_files_by_dir = 0 avg_files_by_dir = 0 std_files_by_dir = 0 med_files_by_dir = 0 mode_files_by_dir = 0 # Summary: Maximum number of files in directories. messages.append([ "Maximum number of %s files in directories: %d" % (type_name, max_files_by_dir) ]) # Summary: Minimum number of files in directories. messages.append([ "Minimum number of %s files in directories: %d" % (type_name, min_files_by_dir) ]) # Summary: Average of number of files in directories. messages.append([ "Average of number of %s files in directories: %.10g" % (type_name, avg_files_by_dir) ]) # Summary: Standard deviation of number of files in directories. messages.append([ "Standard deviation of number of %s files in directories: %.10g" % (type_name, std_files_by_dir) ]) # Summary: Median of number of files in directories. messages.append([ "Median of number of %s files in directories: %.10g" % (type_name, med_files_by_dir) ]) # Summary: Mode of number of files in directories. messages.append([ "Mode of number of %s files in directories: %.10g" % (type_name, mode_files_by_dir) ])
incomes = np.random.normal(27000, 15000, 10000)  # loc=27000, scale=15000, size=10000
print(type(incomes))
print(incomes.size)
print(incomes)
print(len(incomes))
print(incomes.ndim)
print(incomes.shape)
print(incomes.dtype)

print("Mean value is: ", np.mean(incomes))
print("Median value is: ", np.median(incomes))

from scipy import stats
print("Mode value is: ", stats.mode(incomes)[0])

print("Minimum value is: ", np.min(incomes))
print("Maximum value is: ", np.max(incomes))
print("Standard Deviation is: ", np.std(incomes))

# Segment the income data into 20 buckets and plot it as a histogram:
import matplotlib.pyplot as plt
plt.hist(incomes, 20)
plt.show()

# Box and whisker plot to show the distribution
# https://chartio.com/resources/tutorials/what-is-a-box-plot/
plt.boxplot(incomes)
plt.show()
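# A short sketch of the usual point of this example: the mean is sensitive to
# outliers while the median is not. Appending one extreme income shifts the
# mean dramatically but barely moves the median (illustrative addition, not
# from the original snippet).
incomes2 = np.append(incomes, [1_000_000_000])
print("Mean with outlier: ", np.mean(incomes2))
print("Median with outlier: ", np.median(incomes2))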
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.pipeline import Pipeline
# Imputer and grid_search were removed in newer scikit-learn; the modern
# equivalents are SimpleImputer and model_selection.GridSearchCV.
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('train.csv')
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

age_mean = df['Age'].mean()
age_median = df['Age'].median()

embarked_mode = mode(df['Embarked'])[0][0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)
df = df.drop(['Sex', 'Embarked'], axis=1)

# Move the target column to the front
cols = df.columns.tolist()
cols = [cols[1]] + cols[0:1] + cols[2:]
df = df[cols]

df = df.fillna(-1)
train_data = df.values
def _fraserMode(self, multi=0.1):
    """Fraser mode: scale the data, bin it to integers, find the modal bin,
    and return the median of the raw values that fall in that bin."""
    y = num.array(self.data * multi).astype(int)
    mode = stats.mode(y)[0]    # modal integer bin (length-1 array)
    w = num.where(y == mode)   # indices of the values in the modal bin
    return num.median(self.data[w[0]])
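# Standalone sketch of the same "Fraser mode" idea for quick experimentation
# (the data and multiplier below are made up; `num` mirrors the numpy alias
# used in the method above):
import numpy as num
from scipy import stats

def fraser_mode(data, multi=0.1):
    """Median of the values whose scaled integer bin is the modal bin."""
    data = num.asarray(data)
    y = num.array(data * multi).astype(int)
    m = stats.mode(y)[0]
    w = num.where(y == m)
    return num.median(data[w[0]])

print(fraser_mode([10.2, 10.4, 10.5, 52.0, 10.1], multi=0.1))  # -> 10.3, the median of the 10-ish cluster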
def track(self, poses, identities=None): self.n_frames += 1 trackers = np.zeros((len(self.trackers), 6)) for i in range(len(trackers)): trackers[i, :5] = self.trackers[i].predict() empty = np.isnan(trackers).any(axis=1) trackers = trackers[~empty] for ind in np.flatnonzero(empty)[::-1]: self.trackers.pop(ind) ellipses = [] pred_ids = [] for i, pose in enumerate(poses): el = self.fitter.fit(pose) if el is not None: ellipses.append(el) if identities is not None: pred_ids.append(mode(identities[i])[0][0]) if not len(trackers): matches = np.empty((0, 2), dtype=int) unmatched_detections = np.arange(len(ellipses)) unmatched_trackers = np.empty((0, 6), dtype=int) else: ellipses_trackers = [Ellipse(*t[:5]) for t in trackers] cost_matrix = np.zeros((len(ellipses), len(ellipses_trackers))) for i, el in enumerate(ellipses): for j, el_track in enumerate(ellipses_trackers): cost = el.calc_similarity_with(el_track) if identities is not None: match = 2 if pred_ids[i] == self.trackers[j].id_ else 1 cost *= match cost_matrix[i, j] = cost row_indices, col_indices = linear_sum_assignment(cost_matrix, maximize=True) unmatched_detections = [ i for i, _ in enumerate(ellipses) if i not in row_indices ] unmatched_trackers = [ j for j, _ in enumerate(trackers) if j not in col_indices ] matches = [] for row, col in zip(row_indices, col_indices): val = cost_matrix[row, col] # diff = val - cost_matrix # diff[row, col] += val # if ( # val < self.iou_threshold # or np.any(diff[row] <= 0.2) # or np.any(diff[:, col] <= 0.2) # ): if val < self.iou_threshold: unmatched_detections.append(row) unmatched_trackers.append(col) else: matches.append([row, col]) if not len(matches): matches = np.empty((0, 2), dtype=int) else: matches = np.stack(matches) unmatched_trackers = np.asarray(unmatched_trackers) unmatched_detections = np.asarray(unmatched_detections) animalindex = [] for t, tracker in enumerate(self.trackers): if t not in unmatched_trackers: ind = matches[matches[:, 1] == t, 0][0] animalindex.append(ind) tracker.update(ellipses[ind].parameters) else: animalindex.append(-1) for i in unmatched_detections: trk = EllipseTracker(ellipses[i].parameters) if identities is not None: trk.id_ = mode(identities[i])[0][0] self.trackers.append(trk) animalindex.append(i) i = len(self.trackers) ret = [] for trk in reversed(self.trackers): d = trk.state if (trk.time_since_update < 1) and ( trk.hit_streak >= self.min_hits or self.n_frames <= self.min_hits ): ret.append( np.concatenate((d, [trk.id, int(animalindex[i - 1])])).reshape( 1, -1 ) ) # for DLC we also return the original animalid # +1 as MOT benchmark requires positive >> this is removed for DLC! i -= 1 # remove dead tracklet if trk.time_since_update > self.max_age: self.trackers.pop(i) if len(ret) > 0: return np.concatenate(ret) return np.empty((0, 7))
for i in range(len(x_test_num_avg)-1): x_test_num_concat = pd.concat([x_test_num_concat,x_test_num_avg[i+1][1]], axis=1) x_test_num_avg = x_test_num_concat.groupby(by=x_test_num_concat.columns, axis=1).apply(lambda g: g.mean(axis=1)) # Categorical Imputer imputer_txt = MultipleImputer(strategy='categorical', return_list=True, n=10, seed=101) x_train_txt_avg = imputer_txt.fit_transform(x_train_txt) x_train_txt_col = list(x_train_txt.columns) x_train_txt_col.sort() x_train_txt_concat = x_train_txt_avg[0][1] for i in range(len(x_train_txt_avg)-1): x_train_txt_concat = pd.concat([x_train_txt_concat, x_train_txt_avg[i+1][1]], axis=1) x_train_txt_avg = x_train_txt_concat.groupby(by=x_train_txt_concat.columns, axis=1).apply(lambda g: stats.mode(g, axis=1)[0]) x_train_txt_avg = x_train_txt_avg.sort_index(axis=0) x_train_txt_avg_temp = pd.DataFrame(x_train_txt_avg[0]) for i in range(len(x_train_txt_avg)-1): x_train_txt_avg_temp = pd.concat([x_train_txt_avg_temp,pd.DataFrame(x_train_txt_avg[i+1])], axis=1) x_train_txt_avg_temp.columns = x_train_txt_col x_train_txt_avg = x_train_txt_avg_temp x_train_txt = x_train_txt.sort_index(axis=1) x_test_txt_avg = imputer_txt.fit_transform(x_test_txt) x_test_txt_col = list(x_test_txt.columns) x_test_txt_col.sort() x_test_txt_concat = x_test_txt_avg[0][1]
def app(): global input_df st.title('Home') st.write( 'First adjust the backtest parameters on the left, then launch the backtest by pressing the button below.' ) st.sidebar.header("Backtest parameters") with st.form("input_params"): session_state.startdate = st.sidebar.date_input( 'start date', value=session_state.startdate, min_value=datetime.strptime('1900-01-01', '%Y-%m-%d'), max_value=date.today(), key='startdate', help='start date of the backtest') session_state.enddate = st.sidebar.date_input( 'end date', value=session_state.enddate, min_value=datetime.strptime('1900-01-01', '%Y-%m-%d'), max_value=date.today(), key='enddate', help='end date of the backtest') session_state.initial_cash = st.sidebar.number_input( "initial cash", min_value=0.0, max_value=None, value=session_state.initial_cash, step=1000.0, format='%f', key='initial_cash', help='initial cash') session_state.contribution = st.sidebar.number_input( "contribution or withdrawal", min_value=None, max_value=None, value=session_state.contribution, format='%f', step=0.01, key='contribution', help= 'contribution or withdrawal. Can be specified as % of the portfolio value or in absolute terms.' ) session_state.leverage = st.sidebar.number_input( "leverage", min_value=1.0, max_value=None, step=0.01, value=session_state.leverage, format='%f', key='leverage', help='daily leverage to apply to assets returns') session_state.expense_ratio = st.sidebar.number_input( "expense ratio", min_value=0.0, max_value=1.0, step=0.01, value=session_state.expense_ratio, format='%f', key='expense_ratio', help='annual expense ratio') st.sidebar.subheader("Assets") if session_state.historic == "Yahoo Finance (daily prices)": idx = 0 elif session_state.historic == "Historical DB (daily prices)": idx = 1 else: idx = 2 session_state.historic = st.sidebar.radio( 'data source', ("Yahoo Finance (daily prices)", "Historical DB (daily prices)", "Historical DB (yearly prices)"), index=idx, key='historic', help='choose the data source') if session_state.historic == "Yahoo Finance (daily prices)": historic_cd = None elif session_state.historic == "Historical DB (daily prices)": historic_cd = 'medium' elif session_state.historic == "Historical DB (yearly prices)": historic_cd = 'long' session_state.shares = st.sidebar.text_area( "assets to backtest", value=session_state.shares, height=None, max_chars=None, key="shares", help='tickers in a comma separated list (e.g. "SPY,TLT,GLD")') session_state.shareclass = st.sidebar.text_area( "assets class (for Yahoo Finance only)", value=session_state.shareclass, height=None, max_chars=None, key="shareclass", help= 'class of each asset (e.g. `equity,bond_lt,gold`). Possibilities are `equity, bond_lt, bond_it, gold, commodity`, where "bond_lt" and "bond_it" are long and intermediate duration bonds, respectively. __This argument is mandatory when the data source is Yahoo Finance.' ) session_state.weights = st.sidebar.text_area( "asset weights", value=session_state.weights, height=None, max_chars=None, key="weights", help= 'list of portfolio weights for each asset specified (e.g. `0.35,0.35,0.30`). The weights need to sum to 1. When weights are specified a custom weights strategy is used that simply loads the weights specified. Alternative is to use a pre-defined strategy.' 
) session_state.benchmark = st.sidebar.text_input( "benchmark", value=session_state.benchmark, max_chars=None, key='benchmark', help='ticker of a benchmark') session_state.indicator = st.sidebar.checkbox( "signal assets", value=session_state.indicator, key='indicators', help='load the signal assets needed for the rotation strategy') st.sidebar.subheader("Strategies") session_state.riskparity = st.sidebar.checkbox( 'risk parity', value=session_state.riskparity, key='riskparity', help= 'Dynamic allocation of weights according to the risk parity methodology (see https://thequantmba.wordpress.com/2016/12/14/risk-parityrisk-budgeting-portfolio-in-python/). Here the risk parity is run at portfolio level.' ) session_state.riskparity_nested = st.sidebar.checkbox( 'risk parity nested', value=session_state.riskparity_nested, key='riskparity_nested', help= 'Dynamic allocation of weights according to the risk parity methodology (see https://thequantmba.wordpress.com/2016/12/14/risk-parityrisk-budgeting-portfolio-in-python/). Here the risk parity is run first at asset classe level (for assets belonging to the same asset class) and then at portfolio level.' ) session_state.rotationstrat = st.sidebar.checkbox( 'asset rotation', value=session_state.rotationstrat, key='rotationstrat', help= 'Asset rotation strategy that buy either gold, bonds or equities based on a signal (see https://seekingalpha.com/article/4283733-simple-rules-based-asset-rotation-strategy). To use this strategy tick the box signal assets.' ) session_state.uniform = st.sidebar.checkbox( 'uniform', value=session_state.uniform, key='uniform', help= 'Static allocation uniform across asset classes. Assets are allocated uniformly within the same asset class.' ) session_state.vanillariskparity = st.sidebar.checkbox( 'static risk parity', value=session_state.vanillariskparity, key='vanillariskparity', help= 'Static allocation to asset classes where weights are taken from https://www.theoptimizingblog.com/leveraged-all-weather-portfolio/ (see section "True Risk Parity").' ) session_state.onlystocks = st.sidebar.checkbox( 'only equity', value=session_state.onlystocks, key='onlystocks', help= 'Static allocation only to the equity class. Assets are allocated uniformly within the equity class.' ) session_state.sixtyforty = st.sidebar.checkbox( '60% equities 40% bonds', value=session_state.sixtyforty, key='sixtyforty', help= 'Static allocation 60% to the equity class, 20% to the Long Term Bonds class and 20% to the Short Term Bonds class. Assets are allocated uniformly within the asset classes.' ) session_state.trend_u = st.sidebar.checkbox( 'trend uniform', value=session_state.trend_u, key='trend_u', help= 'First weights are assigned according to the "uniform" strategy. Then, if the current asset price is smaller than the simple moving average, the weight is set to zero (leave as cash).' ) session_state.absmom_u = st.sidebar.checkbox( 'absolute momentum uniform', value=session_state.absmom_u, key='absmom_u', help= 'First weights are assigned according to the "uniform" strategy. Then, if the asset return over the period (momentum) is less than 0, the weight is set to zero (leave as cash).' ) session_state.relmom_u = st.sidebar.checkbox( 'relative momentum uniform', value=session_state.relmom_u, key='relmom_u', help= 'First assets are ranked based on their return over the period (momentum) and divided in two classes. The portfolio is formed by the assets belonging to the higher return class. 
Then, weights are assigned to this portfolio according to the "uniform" strategy.' ) session_state.momtrend_u = st.sidebar.checkbox( 'relative momentum & trend uniform', value=session_state.momtrend_u, key='momtrend_u', help= 'First weights are assigned according to the "uniform" strategy. Second, assets are ranked based on their return over the period (momentum) and divided in two classes. For the assets belonging to the lower return class, the weight is set to zero (leave as cash). Finally, a trend filter is then applied to assets with positive weight: if the current asset price is smaller than the simple moving average, the weight is set to zero (leave as cash).' ) session_state.trend_rp = st.sidebar.checkbox( 'trend risk parity', value=session_state.trend_rp, key='trend_rp', help= 'First weights are assigned according to the "riskparity" strategy. Then, if the current asset price is smaller than the simple moving average, the weight is set to zero (leave as cash).' ) session_state.absmom_rp = st.sidebar.checkbox( 'absolute momentum risk parity', value=session_state.absmom_rp, key='absmom_rp', help= 'First weights are assigned according to the "riskparity" strategy. Then, if the asset return over the period (momentum) is less than 0, the weight is set to zero (leave as cash).' ) session_state.relmom_rp = st.sidebar.checkbox( 'relative momentum risk parity', value=session_state.relmom_rp, key='relmom_rp', help= 'First assets are ranked based on their return over the period (momentum) and divided in two classes. The portfolio is formed by the assets belonging to the higher return class. Then, weights are assigned to this portfolio according to the "risk parity" strategy.' ) session_state.momtrend_rp = st.sidebar.checkbox( 'relative momentum & trend risk parity', value=session_state.momtrend_rp, key='momtrend_rp', help= 'First weights are assigned according to the "riskparity" strategy. Second, assets are ranked based on their return over the period (momentum) and divided in two classes. For the assets belonging to the lower return class, the weight is set to zero (leave as cash). Finally, a trend filter is then applied to assets with positive weight: if the current asset price is smaller than the simple moving average, the weight is set to zero (leave as cash).' ) session_state.GEM = st.sidebar.checkbox( 'Global equity momentum', value=session_state.GEM, key='GEM', help= 'Global equity momentum strategy. Needs only 4 assets of classes equity, equity_intl, bond_lt, money_market. example: `VEU,IVV,BIL,AGG equity_intl,equity,money_market,bond_lt`. See https://blog.thinknewfound.com/2019/01/fragility-case-study-dual-momentum-gem/' ) session_state.acc_dualmom = st.sidebar.checkbox( 'Accelerating Dual Momentum', value=session_state.acc_dualmom, key='acc_dualmom', help= 'Accelerating Dual Momentum. Needs only 3 assets of classes equity, equity_intl, bond_lt. example: VFINX,VINEX,VUSTX, shareclass equity,equity_intl,bond_lt. See https://engineeredportfolio.com/2018/05/02/accelerating-dual-momentum-investing/' ) session_state.acc_dualmom2 = st.sidebar.checkbox( 'Accelerating Dual Momentum (extended)', value=session_state.acc_dualmom2, key='acc_dualmom2', help= 'Accelerating Dual Momentum (extended). Needs only 4 assets of classes equity, equity_intl, bond_lt, gold. example: VFINX,VINEX,VUSTX,GLD shareclass equity,equity_intl,bond_lt,gold.' 
) st.sidebar.subheader("HTML Report") # session_state.create_report = st.sidebar.checkbox('create PDF report', value=session_state.create_report, # key='create_report', help=None) session_state.report_name = st.sidebar.text_input( "report name", value=session_state.report_name, max_chars=None, key='report_name', help=None) session_state.user = st.sidebar.text_input( "user name", value=session_state.user, max_chars=None, key='user', help='user generating the report') session_state.memo = st.sidebar.text_input( "report memo", value=session_state.memo, max_chars=None, key='memo', help='description of the report') #launch_btn = st.button("Launch backtest") launch_btn = st.form_submit_button("Launch backtest") if launch_btn: params['startdate'] = session_state.startdate params['enddate'] = session_state.enddate params['initial_cash'] = session_state.initial_cash params['contribution'] = session_state.contribution params['leverage'] = session_state.leverage params['expense_ratio'] = session_state.expense_ratio params['historic'] = historic_cd params['shares'] = session_state.shares params['shareclass'] = session_state.shareclass params['weights'] = session_state.weights params['benchmark'] = session_state.benchmark params['indicator'] = session_state.indicator params['riskparity'] = session_state.riskparity params['riskparity_nested'] = session_state.riskparity_nested params['rotationstrat'] = session_state.rotationstrat params['uniform'] = session_state.uniform params['vanillariskparity'] = session_state.vanillariskparity params['onlystocks'] = session_state.onlystocks params['sixtyforty'] = session_state.sixtyforty params['trend_u'] = session_state.trend_u params['absmom_u'] = session_state.absmom_u params['relmom_u'] = session_state.relmom_u params['momtrend_u'] = session_state.momtrend_u params['trend_rp'] = session_state.trend_rp params['absmom_rp'] = session_state.absmom_rp params['relmom_rp'] = session_state.relmom_rp params['momtrend_rp'] = session_state.momtrend_rp params['GEM'] = session_state.GEM params['acc_dualmom'] = session_state.acc_dualmom params['acc_dualmom2'] = session_state.acc_dualmom2 params['create_report'] = session_state.create_report params['report_name'] = session_state.report_name params['user'] = session_state.user params['memo'] = session_state.memo # advanced params params['DAYS_IN_YEAR'] = session_state.DAYS_IN_YEAR params[ 'DAYS_IN_YEAR_BOND_PRICE'] = session_state.DAYS_IN_YEAR_BOND_PRICE params['reb_days_days'] = session_state.reb_days_days params['reb_days_years'] = session_state.reb_days_years params['reb_days_custweights'] = session_state.reb_days_custweights params[ 'lookback_period_short_days'] = session_state.lookback_period_short_days params[ 'lookback_period_short_years'] = session_state.lookback_period_short_years params[ 'lookback_period_short_custweights'] = session_state.lookback_period_short_custweights params[ 'lookback_period_long_days'] = session_state.lookback_period_long_days params[ 'lookback_period_long_years'] = session_state.lookback_period_long_years params[ 'lookback_period_long_custweights'] = session_state.lookback_period_long_custweights params[ 'moving_average_period_days'] = session_state.moving_average_period_days params[ 'moving_average_period_years'] = session_state.moving_average_period_years params[ 'moving_average_period_custweights'] = session_state.moving_average_period_custweights params['momentum_period_days'] = session_state.momentum_period_days params['momentum_period_years'] = session_state.momentum_period_years params[ 
'momentum_period_custweights'] = session_state.momentum_period_custweights params[ 'momentum_percentile_days'] = session_state.momentum_percentile_days params[ 'momentum_percentile_years'] = session_state.momentum_percentile_years params[ 'momentum_percentile_custweights'] = session_state.momentum_percentile_custweights params['corrmethod_days'] = session_state.corrmethod_days params['corrmethod_years'] = session_state.corrmethod_years params['corrmethod_custweights'] = session_state.corrmethod_custweights params['riskfree'] = session_state.riskfree params['targetrate'] = session_state.targetrate params['alpha'] = session_state.alpha params['market_mu'] = session_state.market_mu params['market_sigma'] = session_state.market_sigma params['stddev_sample'] = session_state.stddev_sample params['annualize'] = session_state.annualize params['logreturns'] = session_state.logreturns #if input_df != 0: mainout = main(params) if mainout is not False: input_df = copy.deepcopy(mainout) # Portfolio value idx = 0 columns = input_df[idx].columns input_df[idx]['date'] = input_df[idx].index input_df_long = pd.melt(input_df[idx], id_vars=['date'], value_vars=columns, var_name='strategy', value_name='price') fig = px.line(input_df_long, x="date", y="price", color="strategy") st.markdown("### Portfolio value") st.plotly_chart(fig, use_container_width=True) # Portfolio drawdowns idx = 5 # find a smarter way later columns = input_df[idx].columns input_df[idx]['date'] = input_df[idx].index input_df_long = pd.melt(input_df[idx], id_vars=['date'], value_vars=columns, var_name='strategy', value_name='drawdown') fig = px.line(input_df_long, x="date", y="drawdown", color="strategy") st.markdown("### Portfolio drawdown") st.plotly_chart(fig, use_container_width=True) # Portfolio metrics st.markdown("### Portfolio metrics") st.dataframe(input_df[2]) # Portfolio weights st.markdown("### Portfolio weights") # col1, col2 = st.beta_columns(2) # # idx = 3 # columns=input_df[idx].columns # input_df[idx]['date'] = input_df[idx].index # input_df_long = pd.melt(input_df[idx], id_vars=['date','strategy'], value_vars=columns[0:-1],var_name='asset', value_name='weight') # # col1.subheader("Target weights") # # for strat in input_df_long['strategy'].unique(): # fig = px.bar(input_df_long[input_df_long['strategy']==strat], x="date", y="weight", color="asset", title=strat + ' weights') # col1.plotly_chart(fig, use_container_width=True) idx = 4 columns = input_df[idx].columns input_df[idx]['date'] = input_df[idx].index input_df_long = pd.melt(input_df[idx], id_vars=['date', 'strategy'], value_vars=columns[0:-1], var_name='asset', value_name='weight') st.subheader("Effective weights") for strat in input_df_long['strategy'].unique(): fig = px.bar(input_df_long[input_df_long['strategy'] == strat], x="date", y="weight", color="asset", title=strat + ' weights') st.plotly_chart(fig, use_container_width=True) # Asset value idx = 6 columns = input_df[idx].columns input_df[idx]['date'] = input_df[idx].index input_df_long = pd.melt(input_df[idx], id_vars=['date'], value_vars=columns, var_name='asset', value_name='price') fig = px.line(input_df_long, x="date", y="price", color="asset") st.markdown("### Assets value") st.plotly_chart(fig, use_container_width=True) # Assets drawdowns idx = 7 # find a smarter way later columns = input_df[idx].columns input_df[idx]['date'] = input_df[idx].index input_df_long = pd.melt(input_df[idx], id_vars=['date'], value_vars=columns, var_name='asset', value_name='drawdown') fig = px.line(input_df_long, x="date", 
y="drawdown", color="asset") st.markdown("### Assets drawdown") st.plotly_chart(fig, use_container_width=True) # # Portfolio Returns idx = 1 # Determine the price frequency dates = [] for i in range(1, len(input_df[idx].index)): dates.append( datetime.strptime(str(input_df[idx].index[i]), '%Y-%m-%d')) datediff = stats.mode(np.diff(dates))[0][0] if datediff > timedelta(days=250): frequency = "Years" elif datediff < timedelta(days=2): frequency = "Days" rolling_ret_period = st.slider( "rolling returns period (in years)", min_value=1, max_value=30, value=1, step=1, format='%i', key='rolling_ret_period', help='period of rolling annual return (in years)') if frequency == "Days": # plot the rolling return (annualized) for column in input_df[idx]: if params['logreturns']: input_df[idx][column] = (input_df[idx][column]).rolling( window=params['DAYS_IN_YEAR'] * rolling_ret_period).sum() / rolling_ret_period else: input_df[idx][column] = ( 1 + input_df[idx][column]).rolling( window=params['DAYS_IN_YEAR'] * rolling_ret_period).apply( np.prod)**(1 / rolling_ret_period) - 1 elif frequency == "Years": # plot the rolling 5 years return for column in input_df[idx]: if params['logreturns']: input_df[idx][column] = (input_df[idx][column]).rolling( window=rolling_ret_period).mean() else: input_df[idx][column] = ( 1 + input_df[idx][column]).rolling( window=rolling_ret_period).apply(np.prod) - 1 columns = input_df[idx].columns input_df[idx]['date'] = input_df[idx].index input_df_long = pd.melt(input_df[idx], id_vars=['date'], value_vars=columns, var_name='strategy', value_name='rolling return') fig = px.line(input_df_long, x="date", y="rolling return", color="strategy") st.markdown("### Portfolio returns") st.plotly_chart(fig, use_container_width=True) st.markdown("### Downloads area") today_str = datetime.today().strftime('%Y-%m-%d') outputfilename = [ "Fund Prices", "Returns", "Performance Metrics", "Target Weights", "Effective Weights", "Portfolio Drawdown", "Asset Prices", "Assets drawdown" ] i = 0 for name in outputfilename: inputfilepath = name + "_" + today_str + '.csv' tmp_download_link = utils.download_link( input_df[i], inputfilepath, 'Click here to download ' + name) st.markdown(tmp_download_link, unsafe_allow_html=True) i = i + 1 inputfilepath = params['report_name'] + "_" + today_str + '.html' tmp_download_link = utils.download_link( input_df[8], inputfilepath, 'Click here to download the html report') st.markdown(tmp_download_link, unsafe_allow_html=True)
# %matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits

digits = load_digits()
digits.data.shape

kmeans = KMeans(n_clusters=10, random_state=0)
clusters = kmeans.fit_predict(digits.data)
kmeans.cluster_centers_.shape

fig, ax = plt.subplots(2, 5, figsize=(8, 3))
centers = kmeans.cluster_centers_.reshape(10, 8, 8)
for axi, center in zip(ax.flat, centers):
    axi.set(xticks=[], yticks=[])
    axi.imshow(center, interpolation='nearest', cmap=plt.cm.binary)

# Map each cluster to the most common true digit within it
from scipy.stats import mode
labels = np.zeros_like(clusters)
for i in range(10):
    mask = (clusters == i)
    labels[mask] = mode(digits.target[mask])[0]

from sklearn.metrics import accuracy_score
accuracy_score(digits.target, labels)

# The above output shows that the accuracy is around 80%.
def votepredict(tot_predicted):
    """Majority vote: given predictions of shape (n_classifiers, n_samples),
    return the most common prediction for each sample."""
    tot_predicted = np.transpose(tot_predicted)
    vote_predicted = [mode(w).mode[0] for w in tot_predicted]
    return vote_predicted
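# Hypothetical usage sketch of the votepredict defined above: three
# classifiers' predictions for four samples (rows = classifiers,
# columns = samples).
import numpy as np
from scipy.stats import mode

tot_predicted = np.array([[0, 1, 1, 2],
                          [0, 1, 2, 2],
                          [1, 1, 2, 0]])
print(votepredict(tot_predicted))  # -> [0, 1, 2, 2]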
def _extractFeature(self, baseModality: str = "audio", num_word: int = 1, verbose:int = 0, **kwargs): """ recipe: dictionary, required file list to extract feature on each modality baseModality: string, optional, default="audio" base file length for aligning all the other modalities """ # check arguments recipe = kwargs["recipe"] isFlattened = kwargs["isFlattened"] isOnehot = kwargs["isOnehot"] # extract feature from each file self.num_files = len(recipe[list(recipe.keys())[0]]) if verbose > 0: fileIdxIterator = tqdm(np.arange(self.num_files), ascii=True, desc="extracting") else: fileIdxIterator = np.arange(self.num_files) features = dict() for fileIdx in fileIdxIterator: min_length = sys.maxsize for modality in recipe.keys(): features_per_file = self.singleFileExtractor.getXy(fileName=recipe[modality][fileIdx], modality=modality, verbose=verbose) if modality in features.keys(): features[modality].append(features_per_file) else: features[modality] = [features_per_file] if modality != "text": min_length = min(min_length, len(features_per_file)) # align length of each modality for modality in recipe.keys(): features[modality][fileIdx] = features[modality][fileIdx][:min_length] if self.sample_shift > 0: feature_shape = dict() num_total_sample = dict() for modality in features.keys(): for features_per_file in features[modality]: # store the shapes in each modalities if modality not in feature_shape.keys(): feature_shape[modality] = features_per_file[0].shape # store the length in each modalities if modality not in num_total_sample.keys(): num_total_sample[modality] = int( (len(features_per_file) - self.window_size) / self.sample_shift) else: num_total_sample[modality] += int( (len(features_per_file) - self.window_size) / self.sample_shift) print("feature_shape: {0}".format(feature_shape)) print("num_total_sample: {0}".format(num_total_sample)) base_num_sample = [] for modality in features.keys(): if verbose > 0: print("sampling... modality:{0}".format(modality)) # create empty array for samples per one modality if modality == "text": samples = np.zeros((num_total_sample[baseModality], ) + num_word * feature_shape[modality]) elif modality == "ref" or modality == "label": samples = np.zeros((num_total_sample[modality], ) + feature_shape[modality]) else: if isFlattened: samples = np.zeros((num_total_sample[modality], self.window_size * np.prod(feature_shape[modality]))) else: samples = np.zeros((num_total_sample[modality], self.window_size) + feature_shape[modality]) file_shift = 0 for fileIdx, features_per_file in enumerate(features[modality]): if modality == "text": num_sample = base_num_sample[fileIdx] num_word_per_file = len(features_per_file) else: num_sample = int( (len(features_per_file) - self.window_size) / self.sample_shift) # store number of samples at each file on base modality if modality == baseModality: base_num_sample.append(num_sample) for sampleIdx in range(num_sample): if modality == "text": start = int(sampleIdx / num_sample * num_word_per_file) end = int(sampleIdx / num_sample * num_word_per_file) + 1 else: start = sampleIdx * self.sample_shift end = sampleIdx * self.sample_shift + self.window_size if modality == "ref" or modality == "label": mode_val, mode_num = stats.mode(features_per_file[start:end]) sample = mode_val else: if isFlattened: sample = np.array(features_per_file[start:end]).flatten() else: sample = np.array(features_per_file[start:end]) samples[file_shift + sampleIdx] = sample file_shift += num_sample features[modality] = samples return features
depths = np.arange(1, num_depths + 1)
for depth in depths:
    y = np.zeros((n, len(test_data)))
    # We are assuming that N(x) = 0, so there's no noise; this means y* = y_t.
    y_star = t = test_labels
    for i in range(n):
        boot_data, boot_labels = bootstrap_replicate(train_data, train_labels)
        tree = DecisionTree(boot_data, boot_labels, attributes,
                            p_threshold=p_max, max_level=depth)
        y[i] = tree.classify(test_data)

    # Under zero-one loss the main prediction is the mode
    # (least squares: mean; absolute loss: median)
    y_m = st.mode(y, 0)[0][0]

    # Overall test accuracy of the main prediction: correct / total
    accuracy[depth - 1] = sum(np.asarray(y_m == y_star, int)) / len(y_star)

    # Bias: average zero-one loss between the optimal and main predictions
    bias[depth - 1] = np.mean(zero_one_loss(y_star, y_m))

    # Variance: average across examples of
    #   (+1 if main == optimal, -1 otherwise)
    #   * average across test datasets of the zero-one loss between the
    #     individual predictions and the main prediction
    c2 = np.asarray(y_m == y_star, dtype=int) * 2 - 1  # 1 if y_m == y_star, -1 otherwise
    loss_ym_y = np.array([zero_one_loss(y_m, y_i) for y_i in y])
    variance[depth - 1] = np.mean(c2 * np.mean(loss_ym_y, 0))

# Plot bias, variance, and overall accuracy against tree depth
plt.plot(depths, bias)
plt.plot(depths, variance)
plt.plot(depths, accuracy, ls='--')
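# `zero_one_loss` is referenced above but not shown; a minimal sketch
# consistent with how it is used (element-wise 0/1 disagreement between two
# label arrays):
import numpy as np

def zero_one_loss(a, b):
    """Element-wise zero-one loss between two label arrays."""
    return np.asarray(np.asarray(a) != np.asarray(b), dtype=int)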
data = data.iloc[1:] # DATA PREPROCESSING data_convoluted = [] labels = [] # Slide a "SEGMENT_TIME_SIZE" wide window with a step size of "TIME_STEP" for i in range(0, len(data) - SEGMENT_TIME_SIZE, TIME_STEP): eF = data['ElbowFlexion'].values[i: i + SEGMENT_TIME_SIZE] eS = data['ElbowSupination'].values[i: i + SEGMENT_TIME_SIZE] sF = data['ShoulderFlexion'].values[i: i + SEGMENT_TIME_SIZE] sA = data['ShoulderAbduction'].values[i: i + SEGMENT_TIME_SIZE] sR = data['ShoulderRotation'].values[i: i + SEGMENT_TIME_SIZE] data_convoluted.append([eF, eS, sF, sA, sR]) # Label for a data window is the label that appears most commonly label = stats.mode(data['Label'][i: i + SEGMENT_TIME_SIZE])[0][0] labels.append(label) print("Convoluted data shape: ", np.array(data_convoluted).shape) # Convert to numpy data_convoluted = np.asarray(data_convoluted, dtype=np.float32).transpose(0, 2, 1) # One-hot encoding labels = np.asarray(pd.get_dummies(labels), dtype=np.float32) #print("Convoluted data shape: ", data_convoluted.shape) #print("Labels shape:", labels.shape) # SPLIT INTO TRAINING AND TEST SETS X_train, X_test, y_train, y_test = train_test_split(data_convoluted, labels, test_size=0.3, random_state=RANDOM_SEED) #print("X train size: ", len(X_train)) print("X test size: ", len(X_test))
"@Author: @learn.machinelearning" import numpy as np from scipy import stats # Python code for Mean, Median, Mode # Theory https://www.instagram.com/p/BtPtUJRHd16/ import numpy as np from scipy import stats vector_A = np.array([[1, 1, 2, 3, 4, 6, 18]]) #mean value mean = np.mean(vector_A) #median value median = np.median(vector_A) #mode value mode = stats.mode(vector_A) print("Mean: ", mean) print("Median: ", median) print("Mode: ", mode[1][0][0])
""" Created on Mon Feb 26 15:57:49 2018 @author: NI389899 """ import pandas as pd import numpy as np from scipy import stats #Reading excel file using pandas dataset = pd.read_excel("Stats.xlsx") exp_years = dataset.loc[:, "YearsOfExp"].values salary = dataset.loc[:, "Salary in Rs."].values #Getting mean,mode,median using numpy in-built functions mean_exp = np.mean(exp_years) mean_sal = np.mean(salary) mode_exp = stats.mode(exp_years) mode_sal = stats.mode(salary) median_exp = np.median(exp_years) median_sal = np.median(salary) #Printing obtained values print("Mean Years Of Experience: ", mean_exp) print("Mean Salary: ", mean_sal) print("Mode Years Of Experience: ", mode_exp[0][0]) print("Mode Salary: ", mode_sal[0][0]) print("Median Years Of Experience: ", median_exp) print("Median Salary: ", median_sal)
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

df_train = pd.read_csv('/Users/tavleenkaur/Documents/fractal/train.csv')
test_df = pd.read_csv('/Users/tavleenkaur/Documents/fractal/test.csv')

all_items = df_train.Item_ID.unique()
frames = []  # was missing in the original; collects the per-item frames
for item in all_items:
    item_df = pd.DataFrame(df_train.loc[df_train['Item_ID'] == item,
                                        ['Datetime', 'Item_ID', 'ID', 'Category_1',
                                         'Category_2', 'Category_3', 'Price',
                                         'Number_Of_Sales']])
    item_df['Category_2'].fillna(mode(item_df['Category_2']).mode[0], inplace=True)
    # .mode() returns a Series; take its first value and impute in place
    item_df['Category_3'].fillna(item_df['Category_3'].mode()[0], inplace=True)
    item_df['Category_1'].fillna(item_df['Category_1'].mean(), inplace=True)
    item_df['Price'].fillna(item_df['Price'].mean(), inplace=True)  # closing paren was missing
    frames.append(item_df)

    # ----- For plotting ------
    # temp = {}
    # for b in bins:
    #     temp[b] = item_df.loc[item_df['bin'] == b, 'Number_Of_Sales'].sum()
    # plt.plot(list(temp.keys()), list(temp.values()))

final_df = pd.concat(frames)

# Initialise the regression model
regr = linear_model.LinearRegression()
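# A more idiomatic sketch of the same per-item imputation using
# groupby/transform instead of the explicit loop (column names follow the
# snippet above; untested against the actual CSVs, which are not available):
fill_price = df_train.groupby('Item_ID')['Price'].transform('mean')
df_train['Price'] = df_train['Price'].fillna(fill_price)
fill_cat1 = df_train.groupby('Item_ID')['Category_1'].transform('mean')
df_train['Category_1'] = df_train['Category_1'].fillna(fill_cat1)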
target_x = tf.placeholder("float", [1,784]) #target vector X = tf.placeholder("float", [None, 784]) #matrix of observations to compare to target y = tf.placeholder("float", [None, 10]) #matrix of one-hot class vectors l1_dist= tf.reduce_sum(tf.abs(tf.sub(X, target_x)), 1) #euclidean distance. the sum of squared differences between elements, row-wise. l2_dist = tf.reduce_sum(tf.square(tf.sub(X, target_x)), 1) #euclidean distance. the sum of squared differences between elements, row-wise. #nn = tf.argmin(l1_dist, 0) nn = tf.nn.top_k(-l1_dist, k) init = tf.initialize_all_variables() accuracy_history = [] with tf.Session() as sess: sess.run(init) for obs in range(X_test.shape[0]): nn_index = sess.run(nn, feed_dict = {X: X_train, y: y_train, target_x: np.asmatrix(X_test[obs])}) pred_classes = [] for i in range(k): nn_class = np.argmax(y_train[nn_index[1][i]]) #print nn_class pred_classes.append(nn_class) predicted_class = stats.mode(pred_classes)[0][0] true_class = np.argmax(y_test[obs]) print "True class: " + str(true_class) + ", predicted class: " + str(predicted_class) if predicted_class == true_class: accuracy_history.append(1) else: accuracy_history.append(0) print "model was " + str(np.mean(accuracy_history)) + "% accurate"