Example #1
def estimator_knn_cv(X, y, clf, n_neigh):
    neigh = NearestNeighbors(n_neigh, metric="euclidean", algorithm="brute")
    neigh_est = NearestNeighbors(n_neigh, metric="manhattan", algorithm="brute")
    acc = []
    for train, test in StratifiedKFold(y, 5):
        X_train = X[train]
        y_train = y[train]
        X_test = X[test]
        y_test = y[test]
        clf.fit(X_train, y_train)
        estimators = clf.estimators_
        preds_train = np.array(map(lambda e: e.predict(X_train), estimators)).T
        preds_test = np.array(map(lambda e: e.predict(X_test), estimators)).T
        preds_train_proba = np.array(map(lambda e: e.predict_proba(X_train), estimators))
        preds_test_proba = np.array(map(lambda e: e.predict_proba(X_test), estimators))
        p_train = preds_train_proba.swapaxes(0, 1)[:, :, 0]
        p_test = preds_test_proba.swapaxes(0, 1)[:, :, 0]
        neigh.fit(X_train)
        dist, knn = neigh.kneighbors(X_test)
        neigh_est.fit(preds_train)
        dist, knn_est = neigh_est.kneighbors(preds_test)
        # neigh_est.fit(p_train);dist, knn_est = neigh_est.kneighbors(p_test)
        knn_combined_uniq = np.array(map(np.unique, np.hstack((knn[:, :30], knn_est[:, :30]))))
        pp_uniq = np.array([stats.mode(y_train[nn])[0][0] for nn in knn_combined_uniq])
        # pp_uniq = np.array([stats.mode(y_train[nn])[0][0] for nn in knn[:,:30]])
        preds_test_est_knn = np.array(
            [[stats.mode(y_train[nn])[0][0] for nn in knn_est[:, :i]] for i in xrange(1, n_neigh, 2)]
        )
        acc.append(
            [accuracy_score(y_test, pred) for pred in np.vstack((preds_test_est_knn, clf.predict(X_test), pp_uniq))]
        )
    mean_acc = np.mean(acc, axis=0)
    print " ".join("{:.3f}".format(v) for v in mean_acc), " max:{:.3f}".format(mean_acc.max())
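Nearly every example in this collection leans on the same idiom: scipy.stats.mode returns a pair of arrays (modal values and their counts), so result[0][0] extracts the single most frequent label. A minimal standalone sketch of that idiom, assuming an older SciPy (pre-1.9) where mode() always returns length-1 arrays along the reduced axis:

import numpy as np
from scipy import stats

neighbor_labels = np.array([2, 0, 2, 2, 1])
result = stats.mode(neighbor_labels)
majority_label = result[0][0]   # most frequent label -> 2
majority_count = result[1][0]   # how many times it occurs -> 3
print(majority_label, majority_count)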
Example #2
def predictedGroup(p, tr, nn=3, e='s'):
  #Compares the points (p) to your tree (tr), providing the predicted
  # category for each point based on its (nn) nearest neighbors
  #Explanation can be short e='s' or long e='l'
  k = ann.kdtree(tr[:,:tr.shape[1]-1])
  l = k.knn(p[:,:p.shape[1]-1], nn)   #use the nn argument instead of a hard-coded 3
  print "l[0]"; print l[0], "\n";
  #Build the matrix of neighbor group labels (one column per neighbor) in
  #both modes, since the prediction below depends on it
  ll = np.zeros((l[0].shape[0], nn))
  kk = 0
  for i in l[0]:
    ii = 0
    for j in i:
      ll[kk][ii] = tr[j][-1]
      ii += 1
    kk += 1
  if e != 's':
    #long explanation: show the inputs, the neighbor groups and their modes
    print tr,"\n"; print p,"\n";
    print "Groups \n", ll; print "Modes \n", stat.mode(ll,1)
  pred = assignGroup(p, stat.mode(ll,1)[0])
  print pred
  return pred
 def test_03_02_circle(self):
     '''Test the module on a uniform circle'''
     i,j = np.mgrid[-50:51,-50:51]
     labels = (np.sqrt(i*i+j*j) <= 40).astype(int)
     m, workspace = self.run_module(
         np.ones(labels.shape), labels, wants_workspace=True)
     assert isinstance(workspace, cpw.Workspace)
     bins = labels * (1 + (np.sqrt(i*i+j*j) / 10).astype(int))
     for bin in range(1,5):
         data = m.get_current_measurement(OBJECT_NAME, 
                                          feature_frac_at_d(bin, 4))
         self.assertEqual(len(data), 1)
         area = (float(bin) * 2.0 - 1.0)/16.0
         self.assertTrue(data[0] > area - .1)
         self.assertTrue(data[0] < area + .1)
         heatmap = workspace.image_set.get_image(
             HEAT_MAP_NAME + M.F_FRAC_AT_D).pixel_data
         data = data.astype(heatmap.dtype)
         self.assertEqual(mode(heatmap[bins == bin])[0][0], data[0])
         data = m.get_current_measurement(OBJECT_NAME,
                                          feature_mean_frac(bin, 4))
         self.assertEqual(len(data), 1)
         self.assertAlmostEqual(data[0], 1, 2)
         heatmap = workspace.image_set.get_image(
             HEAT_MAP_NAME + M.F_MEAN_FRAC).pixel_data
         data = data.astype(heatmap.dtype)
         self.assertEqual(mode(heatmap[bins == bin])[0][0], data[0])
         data = m.get_current_measurement(OBJECT_NAME,
                                          feature_radial_cv(bin, 4))
         self.assertEqual(len(data), 1)
         self.assertAlmostEqual(data[0], 0, 2)
         heatmap = workspace.image_set.get_image(
             HEAT_MAP_NAME + M.F_RADIAL_CV).pixel_data
         data = data.astype(heatmap.dtype)
         self.assertEqual(mode(heatmap[bins == bin])[0][0], data[0])
def statistics():
    global data
    data=[i.split(',') for i in data.splitlines()]
    column_names = data[0]
    data_rows = data[1:]
    df = pd.DataFrame(data_rows, columns = column_names)
    df["Alcohol"]=df["Alcohol"].astype(float)
    df["Tobacco"]=df["Tobacco"].astype(float)

    print "Alcohol dataset stats:"
    print "Mean = ",df['Alcohol'].mean()
    print "Median =",df['Alcohol'].median()
    print "Mode =", stats.mode(df["Alcohol"])
    print "Range =",max(df['Alcohol']) - min(df['Alcohol'])
    print "Variance =",df['Alcohol'].var()
    print "Standard Deviation =",df['Alcohol'].std()
    print "\n"

    print "Tobacco dataset stats:"
    print "Mean = ",df['Tobacco'].mean()
    print "Median =",df['Tobacco'].median()
    print "Mode =",stats.mode(df["Tobacco"])[0][0]
    print "Range =",max(df['Tobacco']) - min(df['Tobacco'])
    print "Variance =",df['Tobacco'].var()
    print "Standard Deviation =",df['Tobacco'].std()
def give_me_seperation_and_repitition(min_list):
    seperation_list=[]
    sep_and_repeat_list = []
    
    for i in range(1,len(min_list)):
        diff = min_list[i] - min_list[i-1]
        seperation_list.append(diff)
        
    #print seperation_list
   
    repeated_seperation = mode(seperation_list)
    while repeated_seperation[1][0] > 1:
        sep_and_repeat_list.append([repeated_seperation[0][0],repeated_seperation[1][0]])
        while repeated_seperation[0][0] in seperation_list :
            seperation_list.remove(repeated_seperation[0][0])
        repeated_seperation = mode(seperation_list)

    #print sep_and_repeat_list
    
    if sep_and_repeat_list != []:
        for i in range(len(sep_and_repeat_list)):
            #print sep_and_repeat_list[i][1]
            if sep_and_repeat_list[i][1] / len(seperation_list) < 0.1 : # it means less than this percent
                sep_and_repeat_list[i] = [0,0]
    while [0, 0] in sep_and_repeat_list : sep_and_repeat_list.remove([0,0])
    return sep_and_repeat_list
def lesson():
    global data
    data=[i.split(',') for i in data.splitlines()]
    column_names = data[0]
    data_rows = data[1:]
    df = pd.DataFrame(data_rows, columns = column_names)
    df["Alcohol"]=df["Alcohol"].astype(float)
    df["Tobacco"]=df["Tobacco"].astype(float)

    print df['Alcohol'].mean()
    print df['Alcohol'].median()
    print stats.mode(df["Alcohol"])[0][0]
    print '\n'
    print df['Tobacco'].mean()
    print df['Tobacco'].median()
    print stats.mode(df["Tobacco"])[0][0]
    print '\n'
    print max(df['Alcohol']) - min(df['Alcohol'])
    print df['Alcohol'].std()
    print df['Alcohol'].var()
    print '\n'
    print max(df['Tobacco']) - min(df['Tobacco'])
    print df['Tobacco'].std()
    print df['Tobacco'].var()
    print "\n"
    ##z-score of Utopian people
    mean = 251
    std = 20

    x = 2.3*std+mean
    print "the days corresponding to a z-score of 2.3 is",x
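The z-score step at the end of lesson() inverts z = (x - mean) / std to recover the raw value at z = 2.3; a quick standalone check of that arithmetic with the same numbers:

mean, std, z = 251, 20, 2.3
x = z * std + mean   # 2.3 * 20 + 251
print(x)             # 297.0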
Example #7
    def add_instance(self, instance, target, representation=None):
        """Adds the given instance, target and representation to the corpus.

        Args:
            instance: a vector with shape equal to self.instances.shape[1]
            target: a list of strings representing the classes.
            representation: a string
        """
        if isinstance(self.instances, scipy.sparse.csr.csr_matrix):
            instance = csr_matrix(instance)
            self.instances = vstack((self.instances, instance), format='csr')
        else:
            self.instances = csr_matrix(instance)
        self.full_targets.append(target)
        self.representations.append(representation)
        if target:
            if mode(target)[1][0] != 1:
                self.primary_targets.append(mode(target)[0][0])
            else:
                self.primary_targets.append(target[0])
        else:
            self.primary_targets.append(None)

        for key in self.extra_info:
            self.extra_info[key].append(0)
def back_to_numbers(class_scores,scores_to_lables_lists,numclass):
	from scipy.stats import mode
	back_to_values_mean   = np.zeros(len(class_scores))
	back_to_values_mode_small   = np.zeros(len(class_scores))
	back_to_values_mode_larger   = np.zeros(len(class_scores))
	back_to_values_median = np.zeros(len(class_scores))
	back_to_values_max    = np.zeros(len(class_scores))
	back_to_values_min    = np.zeros(len(class_scores))
	lables = ['A','B','C','D','E','F','G','H','I','J','K']
	numbers_lables_dict = dict()
	for j in range(0,11):
		numbers_lables_dict[lables[j]] = j

	for i in range(len(class_scores)):
		cs = class_scores[i]
		bin = numbers_lables_dict[cs]

		back_to_values_mean[i]  		= np.array(scores_to_lables_lists[bin]).mean()
		back_to_values_mode_small[i]   	= mode(scores_to_lables_lists[bin])[0][0]
		back_to_values_mode_larger[i] 	= mode(scores_to_lables_lists[bin])[1][0] 
		back_to_values_median[i] 		= np.median(scores_to_lables_lists[bin])
		back_to_values_max[i]    		= np.array(scores_to_lables_lists[bin]).max()
		back_to_values_min[i]    		= np.array(scores_to_lables_lists[bin]).min()

	return [back_to_values_mean, back_to_values_mode_small, back_to_values_mode_larger, back_to_values_median, back_to_values_max, back_to_values_min]
Example #9
    def predict(self, X):
        """Predict the class labels for the provided data

        Parameters
        ----------
        X: array
            A 2-D array representing the test points.

        Returns
        -------
        labels: array
            List of class labels (one for each data sample).
        """
        X = atleast2d_or_csr(X)

        if self.classification_type == 'knn_vote':
            neigh_ind = self.kneighbors(X, return_distance=False)
            pred_labels = self._y[neigh_ind]
            mode, _ = stats.mode(pred_labels, axis=1)
            return mode.flatten().astype(np.int)
        else:
            neigh_ind = self.radius_neighbors(X, return_distance=False)
            pred_labels = [self._y[ind] for ind in neigh_ind]
            return np.asarray([stats.mode(pi) for pi in pred_labels],
                              dtype=np.int)
Example #10
def make_fused_set(features, labels, files, data_set, target_file):
    #Start fusing
    subject_idx = []
    subject_vol_dict = collections.defaultdict(list)
    subject_class_dict = collections.defaultdict(list)
    j=0
    #find volume indices for each unique subject
    for i in files:
        paresd_str = str(i).split('_')
        subject_id = int(paresd_str[4])
        subject_idx.append(subject_id)
        subject_vol_dict[subject_id].append(j)
        subject_class_dict[subject_id].append(labels[j])
        j=j+1

    print len(subject_vol_dict), len(subject_class_dict)
    fuse_vol_array = np.zeros((len(set(subject_idx)),features.shape[1]))
    fuse_class_array = np.zeros(len(set(subject_idx)))
    j=0
    for i in set(subject_idx):
        fuse_vol = stats.mode(features[subject_vol_dict[i]])
        fuse_vol_array[j,:] = fuse_vol[0]
        fuse_class_array[j] = stats.mode(subject_class_dict[i])[0]
        j=j+1

    #Save fused files
    output_data = h5.File(target_file, 'a')
    output_data.create_dataset('{}_data_fused'.format(data_set), data=fuse_vol_array)
    output_data.create_dataset('{}_class_fused'.format(data_set), data=fuse_class_array)
    output_data.close()
Example #11
def new_data(data): #{
  z=[];                     #used to store the index of zeroed elements
  m,n=numpy.shape(data);    #get the dimensions of the matrix
  y=int(data[0,n-1]);       #extract the classification from the last column  
  sim_length = numpy.zeros(m) #create a 2 element array to store the lengths
  for i in range(m):
    z.append(numpy.nonzero(data[i,:]==0)[0])
    if len(z[i])>0:
      sim_length[i] = z[i][0]
    else:
      sim_length[i] = n

  longest = max(sim_length)
  shortest = min(sim_length)
  
  i=0
  for i in range(m):
    length = sim_length[i]
    if length<longest:
      stuffing = int(longest-length)
      start=int(numpy.floor(length/2))
      
      
      x= numpy.zeros((stuffing-1,n),dtype=int)
      in_proc_stretch = numpy.vstack((data[1,:],x))
      offset = start; 
      for j in range(1,stuffing):
        in_proc_stretch[j,0:offset]=in_proc_stretch[j-1,0:offset]
        md = stats.mode(data[0,0:sim_length[0]-1])
        in_proc_stretch[j,offset]= md[0]+numpy.random.random_integers(-3,3)
        in_proc_stretch[j,offset+1:length+1] = in_proc_stretch[j-1,offset:length]
        length=length+1
        offset=offset+1
      
      
    else:
      shrinkage = int(length-shortest)
      x = numpy.zeros((shrinkage-2,n),dtype=int)
      in_proc_shrink = numpy.vstack((data,x))
      in_proc_shrink[1,:]= 0
      
      for j in range(0,shrinkage-1):
        md = stats.mode(in_proc_shrink[j,0:length])
        ind = numpy.nonzero(in_proc_shrink[j,0:length]==md[0])
        in_proc_shrink[j+1,0:ind[0][-1]] = in_proc_shrink[j,0:ind[0][-1]]
        in_proc_shrink[j+1,ind[0][-1]:-2] = in_proc_shrink[j,(ind[0][-1]+1):-1]
        length = length-1
      
      
  in_proc_stretch[:,n-1]=y
  in_proc_stretch[:,0:start]=data[0,0:start]
  print in_proc_stretch
  in_proc_shrink[:,n-1]=y
  in_proc_shrink[:,0:start]=data[1,0:start]
  print in_proc_shrink
  
  sim_data = numpy.vstack((in_proc_stretch,in_proc_shrink))
  
  return sim_data
Example #12
def Assign_Parameters_Semidistributed(covariates,metadata,hydrobloks_info,OUTPUT,cluster_ids,mask):

 nclusters = hydrobloks_info['nclusters']
 #Initialize the arrays
 vars = ['area','area_pct','BB','DRYSMC','F11','MAXSMC','REFSMC','SATPSI',
         'SATDK','SATDW','WLTSMC','QTZ','slope','ti','dem','carea','channel',
         'land_cover','soil_texture_class',
         'mannings','m','psoil','pksat','sdmax']
 OUTPUT['hsu'] = {}
 for var in vars:
  OUTPUT['hsu'][var] = np.zeros(nclusters)

 #Metadata
 #NLCD2NOAH = {11:17,12:15,21:10,22:10,23:10,24:13,31:16,41:4,42:1,43:5,51:6,52:6,71:10,72:10,73:19,74:19,81:10,82:12,90:11,95:11}
 for hsu in np.arange(nclusters):
  #Set indices
  idx = np.where(cluster_ids == hsu)
  #Calculate area per hsu
  OUTPUT['hsu']['area'][hsu] = metadata['resx']**2*idx[0].size
  #Calculate area percentage per hsu
  OUTPUT['hsu']['area_pct'][hsu] = 100*OUTPUT['hsu']['area'][hsu]/(metadata['resx']**2*mask[mask].size)
  #Soil properties
  for var in ['BB','DRYSMC','F11','MAXSMC','REFSMC','SATPSI','SATDK','SATDW','WLTSMC','QTZ']:
   if var in ['SATDK','SATDW']:
    OUTPUT['hsu'][var][hsu] = stats.mstats.hmean(covariates[var][idx])
   else:
    OUTPUT['hsu'][var][hsu] = stats.mstats.gmean(covariates[var][idx])
  #OUTPUT['hsu']['SATDW'][hsu] = 0.0
  #Average Slope
  OUTPUT['hsu']['slope'][hsu] = np.mean(covariates['cslope'][idx])
  #print 'mean',np.mean(np.sin(covariates['cslope'][idx]))
  #print 'arcsin mean',np.arcsin(np.mean(np.sin(covariates['cslope'][idx])))
  #OUTPUT['hsu']['slope'][hsu] = np.arcsin(np.mean(np.sin(covariates['cslope'][idx])))
  #print np.min(covariates['cslope'][idx]),np.mean(covariates['cslope'][idx]),np.max(covariates['cslope'][idx])
  #print OUTPUT['hsu']['slope'][hsu]
  #Topographic index
  OUTPUT['hsu']['ti'][hsu] = np.mean(covariates['ti'][idx])
  #DEM
  OUTPUT['hsu']['dem'][hsu] = np.mean(covariates['dem'][idx])
  #Average Catchment Area
  OUTPUT['hsu']['carea'][hsu] = np.mean(covariates['carea'][idx])
  #Channel?
  OUTPUT['hsu']['channel'][hsu] = stats.mode(covariates['channels'][idx])[0]
  #Land cover type  
  #OUTPUT['hsu']['land_cover'][hsu] = NLCD2NOAH[95]#stats.mode(covariates['nlcd'][idx])[0][0]]
  OUTPUT['hsu']['land_cover'][hsu] = stats.mode(covariates['nlcd'][idx])[0][0]
  #Soil texture class
  OUTPUT['hsu']['soil_texture_class'][hsu] = stats.mode(covariates['TEXTURE_CLASS'][idx])[0][0]
  #Define the estimate for the model parameters
  OUTPUT['hsu']['m'][hsu] = 0.1 #Form of the exponential decline in conductivity (0.01-1.0)
  OUTPUT['hsu']['pksat'][hsu] = 1.0 #saturated hydraulic conductivity scalar multiplier (0.1-1.0)
  OUTPUT['hsu']['psoil'][hsu] = 1.0 #soil hydraulic properties (residual,wilting,field capacity, and porosity) (0.1-10.0)
  OUTPUT['hsu']['sdmax'][hsu] = 5.0 #maximum effective deficit of subsurface saturated zone (0.1-10.0)
  if np.max(covariates['carea'][idx]) >= 100000.0: OUTPUT['hsu']['mannings'][hsu] = 0.03 #manning's n for channel flow (0.01-0.1)
  else: OUTPUT['hsu']['mannings'][hsu] = 0.15 #manning's n for overland flow (0.01-0.8)

 return OUTPUT
Example #13
def classify_by_ola_proba(local_accuracy, pred):
    sorted_acc = np.sort(np.unique(local_accuracy))[::-1]
    for a in sorted_acc:
        acc_indices = np.where(local_accuracy == a)[0]
        val, count = stats.mode(pred[acc_indices])
        if count[0] > acc_indices.shape[0] / 2:
            return val[0]
    print "tie"
    return stats.mode(pred)[0][0]
Example #14
 def calculate_purity(self, vector_purity,labels,epoch,gamma=None):
     labels = labels[0:self.n_samples]
     if gamma== None:
         for i in range(self.n_clusters):
             vector_purity[epoch,i] = (mode(self.y[np.where(labels == i)])[1]/np.where(labels == i)[0].shape[0])[0]
         return vector_purity
     else:
         for i in range(self.n_clusters):
             vector_purity[epoch,i,gamma] = (mode(self.y[np.where(labels == i)])[1]/np.where(labels == i)[0].shape[0])[0]
         return vector_purity
Example #15
def statresults(agdia, audia):

	# Statistical report
	dfinal = pd.DataFrame([[len(audia)],[len(agdia)]], 
	columns=['Total particles counted'], index=['AuNP','AgNP'])
	dfinal['Mean Diameter (nm)'] = np.round([np.mean(audia),np.mean(agdia)],1)
	dfinal['Median Diameter (nm)'] = np.round([np.median(audia),np.median(agdia)],1)
	dfinal['Mode Diameter (nm)'] = np.round([mode(audia.tolist())[0],
	mode(agdia.tolist())[0]],1)
	
	return dfinal
Example #16
def reed_muller_decode(blocks):
	x1 = R[1]
	x2 = R[2]
	x3 = R[3]
	notx1 = (~x1 % 2)
	notx2 = (~x2 % 2)
	notx3 = (~x3 % 2)

	# Characteristic vectors and coefficients of x3
	x3_1 = np.logical_and(x1,x2) % 2
	x3_2 = np.logical_and(x1,notx2) % 2
	x3_3 = np.logical_and(notx1,x2) % 2
	x3_4 = np.logical_and(notx1,notx2) % 2
	v3 = np.array([x3_1,x3_2,x3_3,x3_4])
	all3 = np.dot(v3, blocks) % 2
	# c3 = stats.mode(all3)[0]
	(mode3, count3) = stats.mode(all3)
	c3 = ((mode3 == 1) * (count3 > all3.shape[0]/2)) % 2

	# Characteristic vectors and coefficients of x2
	x2_1 = np.logical_and(x1,x3) % 2
	x2_2 = np.logical_and(x1,notx3) % 2
	x2_3 = np.logical_and(notx1,x3) % 2
	x2_4 = np.logical_and(notx1,notx3) % 2
	v2 = np.array([x2_1,x2_2,x2_3,x2_4])
	all2 = np.dot(v2, blocks) % 2
	# c2 = stats.mode(all2)[0]
	(mode2, count2) = stats.mode(all2)
	c2 = ((mode2 == 1) * (count2 > all2.shape[0]/2)) % 2

	# Characteristic vectors and coefficients of x1
	x1_1 = np.logical_and(x2,x3) % 2
	x1_2 = np.logical_and(x2,notx3) % 2
	x1_3 = np.logical_and(notx2,x3) % 2
	x1_4 = np.logical_and(notx2,notx3) % 2
	v1 = np.array([x1_1,x1_2,x1_3,x1_4])
	all1 = np.dot(v1, blocks) % 2
	# c1 = stats.mode(all1)[0]
	(mode1, count1) = stats.mode(all1)
	c1 = ((mode1 == 1) * (count1 > all1.shape[0]/2)) % 2

	# Calculate coefficient of 0th row
	coefficients = np.concatenate((c1, c2, c3),axis=0)
	dotted = np.dot(coefficients.T, np.array([x1, x2, x3])).T % 2
	all0 = (dotted + blocks) % 2

	# If more 1's, then 1. Otherwise, 0.
	# This also handles case when equal number of 1's and 0's -> Set to 0
	(mode0, count0) = stats.mode(all0)
	c0 = ((mode0 == 1) * (count0 > all0.shape[0]/2)) % 2

	decoded = np.concatenate([c0, c1, c2, c3],axis=0)
	decoded = decoded.astype(int)
	return decoded
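Example #16 decodes each Reed-Muller coefficient by majority vote, combining the mode with a strict-majority count check so that ties fall back to 0. A small illustration of that pattern on a single column of hypothetical votes, again assuming the pre-1.9 array-returning stats.mode:

import numpy as np
from scipy import stats

votes = np.array([1, 1, 0, 0])   # equal numbers of 1s and 0s
m, c = stats.mode(votes)         # ties resolve to the smaller value, so m[0] == 0
bit = int((m[0] == 1) and (c[0] > votes.shape[0] / 2))
print(bit)                       # 0, matching "equal number of 1's and 0's -> Set to 0"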
def _get_prf(res_set):

    res_set=np.array(res_set)
    modes=[]
    precs=[]
    recs=[]
    for res in res_set:
        modes.append(mode(res)[0][0])
        precs.append(mode(res)[0][0]/len(res))

    for m in modes:
        m=0
Example #18
 def calculate_purity(self, labels,epoch,n_iter=0,gamma=None):
     labels = labels[0:self.n_samples]
     if gamma== None:
         for i in range(self.n_clusters):
             self.purity['kmeans'][epoch,i,n_iter] = \
                 (mode(self.y[np.where(labels == i)])[1]/np.where(labels == i)[0].shape[0])[0]
         return self.purity['kmeans']
     else:
         for i in range(self.n_clusters):
             self.purity['kernelkmeans'][epoch,i,gamma,n_iter] =\
                 (mode(self.y[np.where(labels == i)])[1]/np.where(labels == i)[0].shape[0])[0]
         return self.purity['kernelkmeans'] 
Example #19
def camera_spot(i, D):
    # split the distance histogram at the elbow between its linear
    # region and its exponential region
    elbow = np.argmin(D-i)
    j, Dl = index_normalize(D[elbow:])
    k, Dr = index_normalize(D[:elbow])
    # now compute the mode count for each region and divide by the size
    # of the region. if either mode count is high, that likely indicates
    # a camera spot
    mcl = 1. * stats.mode(Dl)[1][0] / Dl.size
    mcr = 1. * stats.mode(Dr)[1][0] / Dr.size
    return max(mcl, mcr)
Example #20
def homogeneity(labels1, labels2):
    num_missed = 0.0
    for label in set(labels1):
        matches = labels2[labels1 == label]
        match_mode = mode(matches).mode[0]
        num_missed += np.sum(matches != match_mode)

    for label in set(labels2):
        matches = labels1[labels2 == label]
        match_mode = mode(matches).mode[0]
        num_missed += np.sum(matches != match_mode)

    return num_missed / 2.0
Example #21
    def compute(self, do_pdf='yes'):
        for i in xrange(self.nD):
            self.bigpdf = zeros(len(self.zbins))
            if self.dict_zp.has_key(i):
                out = array(self.dict_zp[i]['zp'])
                #wout=array(self.dict_zp[i]['wp'])
                if self.dict_zp[i].has_key('zs'): self.zs[i] = self.dict_zp[i]['zs']
                if self.Pars.predictionclass == 'Reg':
                    for zpi in xrange(len(out)):
                        mybin = int(floor(out[zpi] / self.resz))
                        if mybin > self.Nbins - 1: continue
                        self.bigpdf[mybin] += 1.
                    pdf = self.bigpdf
                    pdf2 = interp(self.zfine2, self.zbins, pdf)
                    pdf2 = where(greater(pdf2, max(pdf2) * 0.01), pdf2, 0.)
                    pdf2 = convolve(pdf2, self.gaus2, 1)
                    pdf2 = where(greater(pdf2, max(pdf2) * 0.005), pdf2, 0.)
                    if sum(pdf2) > 0.: pdf2 /= sum(pdf2)
                    self.zs0[i] = self.zfine2[argmax(pdf2)]
                    self.zs0[i] = min(self.zs0[i], self.Pars.maxz)
                    self.zs0[i] = max(self.zs0[i], self.Pars.minz)
                    self.zs1[i] = sum(self.zfine2 * pdf2)
                    self.zs1[i] = min(self.zs1[i], self.Pars.maxz)
                    self.zs1[i] = max(self.zs1[i], self.Pars.minz)
                    if do_pdf == 'yes':
                        self.err0[i] = utils_mlz.compute_error(self.zfine2, pdf2, self.zs0[i])
                        self.err1[i] = utils_mlz.compute_error(self.zfine2, pdf2, self.zs1[i])
                        self.zConf0[i] = utils_mlz.compute_zConf(self.zfine2, pdf2, self.zs0[i], self.Pars.rmsfactor)
                        self.zConf1[i] = utils_mlz.compute_zConf(self.zfine2, pdf2, self.zs1[i], self.Pars.rmsfactor)
                        pdf2 = pdf2[self.wzin]
                        if sum(pdf2) > 0.: pdf2 /= sum(pdf2)
                        self.bigpdf2[i, :] = pdf2
                if self.Pars.predictionclass == 'Class':
                    if len(out) > 0:
                        self.zs0[i] = mode(out * 1.)[0][0]
                        self.zs1[i] = mean(out * 1.)
                    if len(out) > 0.: self.err0[i] = mode(out * 1.)[1][0] * 1. / (len(out) * 1.)
                    self.err1[i] = std(out * 1.)

        bigZ = zeros((self.nD, 7))
        bigZ[:, 0] = self.zs
        bigZ[:, 1] = self.zs0
        bigZ[:, 2] = self.zs1
        bigZ[:, 3] = self.zConf0
        bigZ[:, 4] = self.zConf1
        bigZ[:, 5] = self.err0
        bigZ[:, 6] = self.err1
        if do_pdf == 'no':
            return bigZ
        else:
            return bigZ, self.bigpdf2
def getSlidingWindowModes(windowSize):
	global dataMax
	assert len(dataMax) > 0
	assert windowSize <= len(dataMax)
	modes = []
	nb_windows = len(dataMax) - windowSize + 1
	datalet = dataMax[:windowSize]
	modes.append(stats.mode(datalet)[0][0])
	for i in range(1, nb_windows):
		datalet.pop(0)
		datalet.append(dataMax[i])
		modes.append(stats.mode(datalet)[0][0])
	print "modes", modes
	return modes
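A hedged usage sketch for getSlidingWindowModes above, assuming it is called from the same module so that the global dataMax it reads is already set (the data values are made up):

dataMax = [1, 1, 2, 2, 2, 3, 3]
print(getSlidingWindowModes(3))   # mode of each length-3 window -> [1.0, 2.0, 2.0, 2.0, 3.0] with an older SciPy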
Example #23
    def predict(self, entities, flag=False):  
        #if depth > 0, object is an entity list. otherwise, problem!!!!
        NA_VAL= -1
        if self.is_svm:
            
            transformed_obj= apply_transforms(self.relations, self.transforms, [entities]) 
            if flag:
                transformed_obj= apply_transforms_other(self.relations, self.transforms[-1:], [entities])
            self.table= zeros((len(transformed_obj), len(self.features)))
            for j,new_feature in enumerate(self.features):
                self.table[:, j]= array([new_feature(ent) for ent in transformed_obj])
            return int(mode(self.query_tree.predict(self.table))[0][0])
        curr_node= self.query_tree
        if curr_node.chosen_tag is None:#edge case in the case of consistent
            return 0#some arbitrary rule
        while curr_node.chosen_query is not None:            
            if len(curr_node.sons.keys())==1: #only one son
                curr_node=curr_node.sons[curr_node.sons.keys()[0]]
                continue
            
            transformed_obj= apply_transforms(curr_node.relations, curr_node.transforms, [entities]) 
            if flag:
                transformed_obj= apply_transforms_other(curr_node.relations, curr_node.transforms[-1:], [entities])
            query_val= None
            #print transformed_obj

            if len(transformed_obj[0])==0: #no entities
                query_val= NA_VAL
                if len(self.transforms)>0:
                    return NA_VAL
                #if not lvl0, return -1 for this
            elif not curr_node.is_rec:
                query_val= curr_node.chosen_query(transformed_obj[0])
            else: 
                vals=[]
                if len(self.transforms)==0: #need apply trans
                    vals= [curr_node.chosen_query([x]) for x in transformed_obj[0] if len(apply_transforms(curr_node.relations, curr_node.justify.transforms, [x])[0])>0]
                else:
                    vals= [curr_node.chosen_query([x]) for x in transformed_obj[0] if len(apply_transforms_other(curr_node.relations, curr_node.justify.transforms[-1:], [x])[0])>0]
                if len(vals)>0:
                    query_val= int(mode(vals)[0][0]) #ISSUE: mode is problem if equal...
                else:
                    query_val= NA_VAL #query for tree is -1
            
            tmp= int(curr_node.chosen_tag)
            curr_node=curr_node.sons.get(query_val)
            if curr_node is None: #tried tree that has no N/A in train, but does in test/ example was []
                return tmp #best possible guess
        return int(curr_node.chosen_tag)
 def classify(self, data):
     nan_cols = np.arange(self.n_features)[np.isnan(data)]
     decisions = []
     s1 = set(nan_cols)
     for i in range(self.n_trees):
         cols = self.col_list[i]
         s2 = set(cols)
         
         if len(s1.intersection(s2)) > 0:
             #decisions[i] = -1
             continue
         decisions.append(self.bags[i].predict(data[cols]))
     if (len(decisions) == 0):
         return (-1, 0, 0)
     return (mode(decisions)[0][0][0], mode(decisions)[1][0][0], len(decisions))
Example #25
    def aggregate_VOTE(self, cv_prediction_matrix, final_prediction_matrix, y_train, y_test):

        final_accuracy_test, final_accuracy_train_cv = 0, 0
        final_accuracy_classifier_index = -1

        cv_accuracies = [0]
        test_accuracies = []

        #calculate accuracy for the first classifier
        test_accuracy_till_now = np.sum(final_prediction_matrix[0] + 1 == y_test)/float(len(y_test))
        test_accuracies.append(test_accuracy_till_now)

        for classifier_index in range(1, self.number_of_classifiers):

            cv_accuracy_till_now = 0
            cv = StratifiedKFold(y = y_train, n_folds = self.config.configuration["number_of_cv_folds"])
            cv_labels_till_now = mode(cv_prediction_matrix[0:classifier_index], axis = 0)[0][0]
            for train_index, test_index in cv:
                y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]

                cv_accuracy_till_now += np.sum(cv_labels_till_now[test_index] + 1 == y_test_cv)/float(len(y_test_cv))

            cv_accuracy_till_now /= float(self.config.configuration['number_of_cv_folds'])
            cv_accuracies.append(cv_accuracy_till_now)

            # print str(classifier_index) + " cv accuracy: " + str(cv_accuracy_till_now)
            test_labels_till_now = mode(final_prediction_matrix[0:classifier_index], axis = 0)[0][0]
            test_accuracy_till_now = np.sum(test_labels_till_now + 1 == y_test)/float(len(y_test))

            test_accuracies.append(test_accuracy_till_now)
            # print "test predictions till now: " + str(test_accuracy_till_now)

            #termination condition
            if classifier_index > self.window_size_for_termination:
                final_accuracy_test, final_accuracy_train_cv, final_accuracy_classifier_index = self.check_threshold(cv_accuracies, classifier_index,
                                                                                                                     test_accuracy_till_now, final_accuracy_test,
                                                                                                                     final_accuracy_train_cv, final_accuracy_classifier_index)
                # if final_accuracy_classifier_index != -1:
                #     break

        if final_accuracy_classifier_index == -1:
            final_accuracy_test, final_accuracy_train_cv = test_accuracies[self.number_of_classifiers - 1], cv_accuracies[self.number_of_classifiers - 1]
            final_accuracy_classifier_index = self.number_of_classifiers

        print self.bo_selection_type, self.subject, final_accuracy_test, final_accuracy_train_cv, final_accuracy_classifier_index
        with open('../n_classifiers.txt', 'a') as f:
            f.write(self.subject + self.bo_selection_type + str(final_accuracy_classifier_index) + '\n')
        return final_accuracy_test, final_accuracy_train_cv, final_accuracy_classifier_index, cv_accuracies, test_accuracies
Example #26
    def predict(self, X):
        """Predict the class labels for the provided data

        Parameters
        ----------
        X: array
            A 2-D array representing the test points.

        Returns
        -------
        labels: array
            List of class labels (one for each data sample).
        """
        X = atleast2d_or_csr(X)

        neigh_dist, neigh_ind = self.kneighbors(X)
        pred_labels = self._y[neigh_ind]

        weights = _get_weights(neigh_dist, self.weights)

        if weights is None:
            mode, _ = stats.mode(pred_labels, axis=1)
        else:
            mode, _ = weighted_mode(pred_labels, weights, axis=1)

        return self.classes_.take(mode.flatten().astype(np.int))
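Example #26 takes a plain mode when all neighbor weights are equal and scikit-learn's weighted_mode otherwise. A small contrast of the two on one row of neighbor labels, with made-up inverse-distance-style weights:

import numpy as np
from scipy import stats
from sklearn.utils.extmath import weighted_mode

labels = np.array([[0, 1, 1, 2]])
weights = np.array([[5.0, 1.0, 1.0, 1.0]])            # e.g. inverse-distance weights

unweighted, _ = stats.mode(labels, axis=1)            # majority by count -> 1
weighted, _ = weighted_mode(labels, weights, axis=1)  # majority by summed weight -> 0
print(unweighted, weighted)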
Example #27
def knnClassifier(training_data, test_data, training_target, test_target, k=5):
   #normalize the data
   #calculate the z-score of the data
   #print training_data
   training_data = training_data
   new_training_data = stats.zscore(training_data.astype(int), axis=0)
   new_test_data = stats.zscore(test_data.astype(int), axis=0)
   #find the k nearest neighbors for each test data
   #print 'test', new_test_data
   predictions = []
   for test in new_test_data:
      #print test
      # find the euclidean distance between the test case and all training cases
      distances = []
      neighbors = []
      neighbor_predictions = []
      for train in new_training_data:
         #print train
         distances.append(np.linalg.norm(train-test))
      #print distances
      for i in range(k):
         neighb_i = distances.index(min(distances))
         neighbors.append(neighb_i)
         distances[neighb_i] = 1000000
      #print neighbors
      for neighb in neighbors:
         neighbor_predictions.append(training_target[neighb])
      predictions.append(stats.mode(neighbor_predictions)[0][0])
   return predictions
Example #28
def calc_modes(N2, bottom_depth, z_bins):
    """Wave velocity and structure of first three modes"""

    dz = np.mean(np.diff(z_bins))

    # Truncate N2 to appropriate length based on depth and dz
    Nz = (bottom_depth/dz).astype(int)
    N2 = N2[:Nz]

    # Find indices of start and end of finite values
    finite_vals = nan_or_masked(N2) == 0
    labels = label(finite_vals)[0]
    main_data = np.where(labels == mode(labels[finite_vals]))[1]
    start_ind, end_ind = main_data[0], main_data[-1]

    # Fill in NaN values with start or end values
    N2[:start_ind] = N2[start_ind]
    N2[end_ind + 1:] = N2[end_ind]

    # Preallocate arrays for horizontal and vertical structure
    hori = np.full((len(z_bins) - 1, 3), np.nan)
    vert = hori.copy()

    hori[:len(N2), :], vert[:len(N2), :], c, _ = vertModes(N2, dz, 3)

    return hori, vert, c[:3]
Example #29
def select_downcast(pressure, for_overturn_calcs=False):
    """Find indices of the downcast part of data"""
    # Take the derivative of the pressure profile
    dp = np.diff(pressure)
    # Constants for the filter
    B, A = signal.butter(2, 0.01, output='ba')
    # Filter the pressure derivative, to smooth out the curve
    dp_smooth = signal.filtfilt(B, A, dp)
    # Make the arrays the same size
    dp_smooth = np.append(dp_smooth, [0])
    # Find the indices where the descent rate is more than 0.05
    falling_inds = dp_smooth > 0.05

    if for_overturn_calcs:
        # For overturns, we want fall to be smooth.
        # Therefore, we want to exclude portions near the surface where
        # fall rate may drop below 0.05. In such cases without the code below
        # we would end up with discontinuous pieces of the profile
        inds_label = label(falling_inds)[0]
        inds_label_mode = mode(inds_label[falling_inds])[0]
        falling_inds[inds_label != inds_label_mode] = False

    falling_inds = np.where(falling_inds)[0]

    return falling_inds
def build_tree(data, labels, word_data, level):
    if (level == 0):
        #return label value which is dominant
        return LabelConv[st.mode(labels)[0][0]-1];
    #select appropriate attribute for the node:
    best, best_ig = attribute_selection(data,labels);
    best_data = data[:,best]; best_word = word_data[best];
    #remove all regarding that attribute from the data:
    word_data = np.delete(word_data,best,0);
    left_data = np.delete(data[best_data == 0,:],best,1); 
    right_data = np.delete(data[best_data == 1,:],best,1);
    #divide labels into two subarray based on selected attribute:
    left_labl = labels[best_data == 0]; 
    right_labl = labels[best_data == 1];
    if (check_label(left_labl) == 2 and level != 0):
        #since label is mono-valued:
        left = LabelConv[left_labl[0]-1];
    else:
        left = build_tree(left_data,left_labl,word_data,level-1);
    if (check_label(right_labl) == 2 and level != 0):
        #since label is mono-valued:
        right = LabelConv[right_labl[0]-1];
    else:
        right = build_tree(right_data,right_labl,word_data,level-1);
    subtrees = {0: left, 1: right};
    return (best_word,best_ig,subtrees);
Example #31
# Add a new column to the existing DataFrame with the encoded values
df[LABEL] = le.fit_transform(df['label'].values.ravel())

RANDOM_SEED = 50

N_TIME_STEPS = 200
N_FEATURES = 2
classes = 4
step = 1
segments = []
labels = []
for i in range(1, len(df) - N_TIME_STEPS, step):
    x1 = df['value a'].values[i:i + N_TIME_STEPS]
    x2 = df['value b'].values[i:i + N_TIME_STEPS]

    label = stats.mode(df['label'][i:i + N_TIME_STEPS])[0][0]
    segments.append([x1, x2])
    labels.append(label)

reshaped_segments = np.asarray(segments, dtype=np.float32).reshape(
    -1, N_TIME_STEPS, N_FEATURES)
labels = np.asarray(pd.get_dummies(labels), dtype=np.float32)

X_train, X_test, y_train, y_test = train_test_split(reshaped_segments,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=RANDOM_SEED)

print('x_train shape: ', X_train.shape)
print(X_train.shape[0], 'training samples')
print('y_train shape: ', y_train.shape)
Example #32
combined_data['Item_Visibility_MeanRatio'] = combined_data.apply(lambda x: x[
    'Item_Visibility'] / item_visibility_mean.loc[x['Item_Identifier']],
                                                                 axis=1)

# step 5
#       For Year_Of_Establishment add new variable NoOfYears. And drop Year_Of_Establishment.
combined_data[
    'Outlet_Years'] = 2018 - combined_data['Outlet_Establishment_Year']

# step 6
#       Find missing values for outlet size by mode.
outlet_size_mode = combined_data.dropna(subset=["Outlet_Size"]).pivot_table(
    values='Outlet_Size',
    columns='Outlet_Type',
    aggfunc=(lambda x: mode(x).mode[0]))
miss_bool = combined_data['Outlet_Size'].isnull()
sum(miss_bool)
combined_data.loc[miss_bool, 'Outlet_Size'] = combined_data.loc[
    miss_bool, 'Outlet_Type'].apply(lambda x: outlet_size_mode[x])
sum(combined_data['Outlet_Size'].isnull())

# step 7
#       Item Type handling.
combined_data['Item_Type_Combined'] = combined_data['Item_Identifier'].apply(
    lambda x: x[0:2])
combined_data['Item_Type_Combined'] = combined_data['Item_Type_Combined'].map({
    'FD':
    'Food',
    'DR':
    'Drinks',
Example #33
def levelPlot(data,
              var=None,
              time=None,
              levels=(3, 5),
              target=None,
              colors=None,
              **kwargs):
    """
    Draw a step-plot with up to 5 levels following a color cycle (e.g. Kp index "stoplight")

    Parameters
    ----------
    data : array-like, or dict-like
        Data for plotting. If dict-like, the key providing an array-like 
        to plot must be given to var keyword argument.

    Other Parameters
    ----------------
    var    : string
        Name of key in dict-like input that contains data
    time   : array-like or string
        Name of key in dict-like that contains time, or arraylike of datetimes
    levels : array-like, up to 5 levels
        Breaks between levels in data that should be shown as distinct colors
    target : figure or axes
        Target axes or figure window
    colors : array-like
        Colors to use for the color sequence (if insufficient colors, will use as a cycle)
    **kwargs : other keywords
        Other keywords to pass to spacepy.toolbox.binHisto

    Returns
    -------
    binned : tuple
        Tuple of the binned data and bins

    Examples
    --------
    >>> import spacepy.plot as splot
    >>> import spacepy.time as spt
    >>> import spacepy.omni as om
    >>> tt = spt.tickrange('2012/09/28','2012/10/2', 3/24.)
    >>> omni = om.get_omni(tt)
    >>> splot.levelPlot(omni, var='Kp', time='UTC', colors=['seagreen', 'orange', 'crimson'])
    """
    #assume dict-like/key-access, before moving to array-like
    if var is not None:
        try:
            usearr = data[var]
        except KeyError:
            raise KeyError('Key "{0}" not present in data'.format(var))
    else:
        #var is None, so make sure we don't have a dict-like
        if not isinstance(data, Mapping):
            usearr = np.asarray(data)
        else:
            raise TypeError(
                'Data appears to be dict-like without a key being given')
    tflag = False
    if time is not None:
        from scipy.stats import mode
        try:
            times = data[time]
        except (KeyError, ValueError, IndexError):
            times = time
        try:
            times = matplotlib.dates.date2num(times)
            tflag = True
        except AttributeError:
            #the x-data are a non-datetime
            times = np.asarray(time)
        #now add the end-point
        stepsize, dum = mode(np.diff(times), axis=None)
        times = np.hstack([times, times[-1] + stepsize])
    else:
        times = np.asarray(range(0, len(usearr) + 1))
    if not colors:
        if len(levels) <= 3:
            #traffic light colours that are distinct to protanopes and deuteranopes
            colors = ['lime', 'yellow', 'crimson', 'saddlebrown']
        else:
            colors = matplotlib.rcParams['axes.color_cycle']
    else:
        try:
            assert len(colors) > len(levels)
        except AssertionError:
            #cycle the given colors, if not enough are given
            colors = list(colors) * int(1 + len(levels) / len(colors))
    if 'alpha' not in kwargs:
        kwargs['alpha'] = 0.75
    if 'legend' not in kwargs:
        legend = False
    else:
        legend = kwargs['legend']
        del kwargs['legend']
    fig, ax = set_target(target)
    subset = np.asarray(dmcopy(usearr))

    def fill_between_steps(ax, x, y1, **kwargs):
        y2 = np.zeros_like(y1)
        stepsxx = x.repeat(2)[1:-1]
        stepsyy = y1.repeat(2)
        y2 = np.zeros_like(stepsyy)
        ax.fill_between(stepsxx, stepsyy, y2, **kwargs)
        if mpl.__version__ < '1.5.0':
            #pre-v1.5.0, need to manually add an artist for the legend
            p = plt.Rectangle((0, 0), 0, 0, **kwargs)
            ax.add_patch(p)

    #below threshold 1
    idx = 0
    inds = usearr > levels[0]
    subset[inds] = np.nan
    kwargs['label'] = u'≤{0}'.format(levels[idx])
    fill_between_steps(ax, times, subset, color=colors[0], zorder=30, **kwargs)
    #for each of the "between" thresholds
    for idx in range(1, len(levels)):
        subset = np.asarray(dmcopy(usearr))
        inds = np.bitwise_or(usearr <= levels[idx - 1], usearr > levels[idx])
        subset[inds] = np.nan
        kwargs['label'] = u'>{0},≤{1}'.format(levels[idx - 1], levels[idx])
        fill_between_steps(ax,
                           times,
                           subset,
                           color=colors[idx],
                           zorder=30 - (idx * 2),
                           **kwargs)
    #last
    idx += 1
    try:
        inds = usearr <= levels[idx - 1]
        subset = np.asarray(dmcopy(usearr))
        subset[inds] = np.nan
        kwargs['label'] = '>{0}'.format(levels[-1])
        fill_between_steps(ax,
                           times,
                           subset,
                           color=colors[idx],
                           zorder=30 - (idx * 2),
                           **kwargs)
    except:
        pass

    #if required, set x axis to times
    if tflag:
        try:
            applySmartTimeTicks(ax, data[time])
        except (IndexError, KeyError):
            #using data array to index, so should just use time
            applySmartTimeTicks(ax, time)
        ax.grid(False,
                which='minor')  #minor grid usually looks bad on these...

    if legend:
        ncols = len(levels) + 1
        if ncols > 3: ncols = ncols // 2
        ax.legend(loc='upper left', ncol=ncols)

    return ax
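levelPlot above infers the plotting step size as the mode of the differences between time stamps, so that it can append one trailing bin edge. A standalone sketch of that trick on hypothetical, mostly regular times with one gap:

import numpy as np
from scipy.stats import mode

times = np.array([0.0, 1.0, 2.0, 4.0, 5.0])        # one missing sample at t = 3
stepsize, dum = mode(np.diff(times), axis=None)    # typical spacing -> 1.0
times = np.hstack([times, times[-1] + stepsize])   # append the trailing edge at 6.0
print(stepsize, times[-1])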
def OnevsOne(dataset, temp_indices, train_dataset, test_dataset,
             learning_parameter, num_classes, weights):

    y = (dataset[:, 7]).astype(int)
    new_models = [[] for i in range(num_classes)]

    X_train = train_dataset[:, :7] / np.max(train_dataset[:, :7], 0)
    y_train = train_dataset[:, 7].astype(int)
    X_test = test_dataset[:, :7] / np.max(test_dataset[:, :7], 0)
    y_actual = test_dataset[:, 7].astype(int)

    labels = np.unique(y).astype('str')
    new_models = [[] for i in range(int(num_classes * (num_classes - 1) / 2))]
    binary_class_models = [
        [] for i in range(int(num_classes * (num_classes - 1) / 2))
    ]
    binary_class_labels = [
        [] for i in range(int(num_classes * (num_classes - 1) / 2))
    ]

    for i in range(len(new_models)):
        new_models[i] = np.where(y == i + 1, 1, 0)

    i = 0
    for p in range(1, num_classes):
        for q in range(p):
            binary_class_labels[i] = labels[q] + labels[p]
            binary_class_models[i] = np.vstack((new_models[q], new_models[p]))
            i += 1

    binary_class_models = [model[1] for model in binary_class_models]

    new_models = binary_class_models
    predictions = [[] for i in range(num_classes)]
    probabilities = [[] for i in range(num_classes)]
    class_predictions = [[] for i in range(num_classes)]

    y_pred = []

    for pred in range(len(predictions)):

        y_test = new_models[pred][temp_indices]
        y_train = np.delete(new_models[pred], temp_indices, axis=0)
        #print("Binary Class: ", binary_class_labels[pred])
        weights = np.random.rand(7)

        for i in range(5000):
            weights = update(X_train, weights, y_train, learning_parameter)

        class_labels = list(map(int, binary_class_labels[pred]))
        probabilities[pred] = sigmoid(
            np.sum(np.multiply(X_test, weights), axis=1))
        predictions[pred] = np.heaviside((probabilities[pred] - 0.5),
                                         0).astype(int)
        #print("Accuracy for Class",binary_class_labels[pred],": ", accuracy_score(y_test, predictions[pred]),"\n")
        class_predictions[pred] = [
            class_labels[label] for label in predictions[pred]
        ]

    y_pred = stats.mode(class_predictions)[0][-1]
    print("Individual Accuracy: ", accuracy_score(y_actual, y_pred), '\n')
    #print(confusion_matrix(y_actual, y_pred))
    acc_score = accuracy_score(y_actual, y_pred) * len(temp_indices)

    return weights, acc_score
Example #35
def calc_staff_font_info(df_0):
    needs_postprocessing = False
    for index_0 in range(len(df_0)):
        height, width = int(df_0[index_0]['height'].max()), int(
            df_0[index_0]['width'].max())
        container_font_info = {
            'pixel_mean': [],
            'delta_line': [],
            'pass_count': [],
            'kind': []
        }
        for index_1 in range(df_0[index_0].shape[0]):
            info = df_0[index_0].iloc[index_1:, :].copy()
            template = img[info['y'].values[0]:info['y'].values[0] + height,
                           info['x'].values[0]:info['x'].values[0] + width]
            if width % 2 != 1:
                width = width - 1
            if height % 2 != 1:
                height = height - 1
            template_blr = cv2.GaussianBlur(template, (width, 1), 0)
            th, template_th = cv2.threshold(
                template_blr, int((np.mean(template_blr)) * 0.75), 255,
                cv2.THRESH_BINARY_INV)

            for index_2 in range(2):
                template_open = cv2.morphologyEx(template_th, cv2.MORPH_OPEN,
                                                 np.ones((1, 75), np.uint8))
                template_close = cv2.morphologyEx(template_open,
                                                  cv2.MORPH_CLOSE,
                                                  np.ones((5, 1), np.uint8))
                template_th = template_close


#            cv2.imshow('template',template_th)
#            cv2.waitKey(0)
#            cv2.destroyAllWindows()

#            cv2.imshow('template_open',template_open)
#            cv2.waitKey(0)
#            cv2.destroyAllWindows()
#
#            cv2.imshow('template_closed',template_close)
#            cv2.waitKey(0)
#            cv2.destroyAllWindows()

            df_1 = pd.DataFrame(
                data={
                    'row_0': template_close[:, 0].copy(),
                    'row_1': template_close[:, -1].copy()
                })
            df_1 = df_1.divide(2)
            df_1['sum'] = df_1['row_0'].add(df_1['row_1'])
            df_1 = df_1.loc[df_1['sum'] > 200]
            df_1['numrow'] = df_1.index.tolist()
            df_1['delta_p'] = df_1['numrow'].diff().shift(-1).fillna(2)
            #            print(df_1)
            df_1 = df_1.loc[df_1['delta_p'] > 5]
            df_1 = df_1.reset_index(drop=True)
            for index_2 in range(df_1.shape[0]):
                if df_1['delta_p'].min() / df_1['delta_p'].mean() < 0.66:
                    hold_value, hold_index = df_1['delta_p'].min(
                    ), df_1['delta_p'].idxmin()
                    if hold_index != 0 and hold_index != df_1.index.tolist(
                    )[-1]:
                        val_0, val_1 = df_1['delta_p'].values[(
                            hold_index -
                            1)], df_1['delta_p'].values[(hold_index + 1)]
                        if val_0 < val_1:
                            hold_index = hold_index - 1
                        else:
                            hold_index = hold_index + 1

                    elif hold_index == 0:
                        hold_index = 1
                    elif hold_index == df_1.index.tolist()[-1]:
                        hold_index = hold_index - 1

                    if np.abs(hold_value + df_1['delta_p'].values[hold_index] -
                              df_1['delta_p'].max()) < 6:
                        df_1.loc[df_1['numrow'] ==
                                 df_1['numrow'].values[hold_index],
                                 'delta_p'] = df_1['delta_p'].values[
                                     hold_index] + hold_value

                    df_1 = df_1.loc[df_1['delta_p'] > df_1['delta_p'].min()]
                    df_1 = df_1.reset_index(drop=True)
                else:
                    break
            df_1['delta_line'] = df_1['delta_p'].tolist()
            template_a = img[info['y'].values[0]:int(info['y'].values[0] +
                                                     template.shape[0]),
                             info['x'].values[0]:int(info['x'].values[0] +
                                                     info['width'].values[0])]
            template_blr_a = cv2.GaussianBlur(template_a, (9, 1), 0)
            th, template_th_a = cv2.threshold(
                template_blr_a,
                int((np.min(template_a) + (255 - np.mean(template_blr_a))) *
                    1.3), 255, cv2.THRESH_BINARY_INV)
            container_font_info['pixel_mean'].append(np.mean(template_th_a))

            if np.abs(df_1['delta_line'].mean() -
                      df_1['delta_line'].mode()[0]) > 2.5:
                container_font_info['delta_line'].append(
                    df_1['delta_line'].tolist())
                needs_postprocessing = True
            else:
                container_font_info['delta_line'].append(
                    int(df_1['delta_line'].mean()))

        df_0[index_0]['delta_line'] = container_font_info['delta_line']
        df_0[index_0]['pixel_mean'] = container_font_info['pixel_mean']

    container_postprocessing_index = []
    if needs_postprocessing == True:
        for index_0 in range(len(df_0)):
            df_2 = df_0[index_0].copy()
            df_2 = df_2.reset_index(drop=True)
            for index, row in df_2.iterrows():
                if type(row['delta_line']) == list:
                    container_postprocessing_index.append(
                        [row['pass_count'], index, index_0, []])

    for data in container_postprocessing_index:
        df_3 = df_0[data[2]].loc[df_0[data[2]]['pass_count'] == data[0]].copy()
        for df_temp in df_0:
            df_4 = df_temp.loc[df_temp['pass_count'] == data[0]].copy()
            for val in df_4['delta_line'].values:
                if type(val) != list:
                    data[3].append(val)
        if len(data[3]) > 0:
            df_0[data[2]]['delta_line'].values[data[1]] = np.mean(data[3])
        else:
            df_0[data[2]]['delta_line'].values[data[1]] = -1
    mean = 0
    for index_0 in range(len(df_0)):
        mean = (mean + df_0[index_0]['delta_line'].mean())
    mean = mean / len(df_0)
    for index_0 in range(len(df_0)):
        df_0[index_0].loc[(df_0[index_0]['delta_line'] < 0),
                          'delta_line'] = mean
    container_delta_line = np.array([], dtype=np.uint8)
    for df_temp in df_0:
        container_delta_line = np.append(container_delta_line,
                                         df_temp['delta_line'].values)
    if container_delta_line.max() != container_delta_line.min(
    ) and container_delta_line.max() - container_delta_line.min() < 6:
        for df_temp in df_0:
            df_temp['delta_line'] = [stats.mode(container_delta_line)[0][0]
                                     ] * df_temp.shape[0]

    for df_temp in df_0:
        df_temp['font_scaling'] = df_temp['delta_line'].divide(
            container_delta_line.mean())

    return df_0
Example #36
    def __init__(self, labelimg_list, brainimg, bounding_boxes=None, beta=-.2, mixing_ratio=10,
                 patch_length=5, same_threshold=True, thresholds=[0.8, 0.6]): #use same defaults as the parser
                
        def positive_int(x): #avoid nonsense negative parameter values   
            x = int(x)
            if x < 0:
                raise AssertionError("%r is not a positive int"%(x,))
            return x
            
        def restricted_float(x): #avoid nonsense values for the threshold
            x = float(x)
            if x < 0.0 or x > 1.0:
                raise AssertionError("%r not in range [0.0, 1.0]"%(x,))
            return x
            
        #catch invalid parameters
        self.beta = float(beta)        
        self.mixing_ratio = positive_int(mixing_ratio)
        self.patch_length = positive_int(patch_length)
        for threshold in thresholds:
            threshold = restricted_float(threshold)
        
        self.bounding_box = [] #get the final bounding box for AWoL-MRF
        self.bounding_box.append(np.amin(bounding_boxes[:, :3], axis=0) - self.patch_length) #min indices
        self.bounding_box.append(np.amax(bounding_boxes[:, :3] + bounding_boxes[:, 3:], axis=0) + self.patch_length)
        self.bounding_box.append(self.bounding_box[1] - self.bounding_box[0]) #dimensions
        
        #get the bounded label arrays with bounding boxes     
        volhandles = []        
        nimg = len(labelimg_list)
        for n, img in enumerate(labelimg_list):
            xmin = bounding_boxes[n][0] - self.bounding_box[0][0] #find the bounded indices for this image
            ymin = bounding_boxes[n][1] - self.bounding_box[0][1]
            zmin = bounding_boxes[n][2] - self.bounding_box[0][2]
            xmax = xmin + bounding_boxes[n][3] 
            ymax = ymin + bounding_boxes[n][4]
            zmax = zmin + bounding_boxes[n][5]
            
            #get the bounded label array
            label_array = np.zeros((self.bounding_box[2][2], self.bounding_box[2][1], self.bounding_box[2][0]))            
            label_array[zmin:zmax, ymin:ymax, xmin:xmax] = sitk.GetArrayFromImage(img)
            
            if len(volhandles) == 0:
                self.label_values = np.unique(label_array) #obtain the list of labels
            elif np.asarray(np.unique(label_array) != self.label_values).any(): #each image should have the same labels
                raise AssertionError("Labels in {0} not the same as in {1}.".format(img, labelimg_list[0]))
            
            volhandles.append(label_array)
        
        if len(self.label_values) != len(thresholds):
            if not(same_threshold):
                raise AssertionError("Number of labels does not match number of thresholds.")
            else:
                while len(thresholds) < len(self.label_values):
                    thresholds.append(thresholds[-1]) #same threshold for each structural label
        
        self.mode = stats.mode(volhandles) #find the majority votes
        self.labels = np.zeros(volhandles[0].shape) - 1 #array of labels, -1 is for low-confidence voxels
        self.intensity = sitk.GetArrayFromImage(brainimg[self.bounding_box[0][0]:self.bounding_box[1][0], 
                                                         self.bounding_box[0][1]:self.bounding_box[1][1],
                                                         self.bounding_box[0][2]:self.bounding_box[1][2]])

        #find the high-confidence voxels for each label
        for i, l in enumerate(self.label_values.tolist()): 
            above_threshold = np.where((self.mode[0][0] == l) & (self.mode[1][0] >= thresholds[i]*nimg))
            below_threshold = np.where((self.mode[0][0] == l) & (self.mode[1][0] < thresholds[i]*nimg))
            print(above_threshold[0].shape, below_threshold[0].shape)
            
            #threshold reduction if necessary
            while below_threshold[0].shape > above_threshold[0].shape and thresholds[i] > .55:
                thresholds[i] -= .05                
                above_threshold = np.where((self.mode[0][0] == l) & (self.mode[1][0] >= thresholds[i]*nimg))
                below_threshold = np.where((self.mode[0][0] == l) & (self.mode[1][0] < thresholds[i]*nimg))
                
            self.labels[above_threshold] = l
            
        self.brainimg = brainimg #keep this to copy the metadata to the output image
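
A minimal sketch of the majority-vote step above: stats.mode along axis 0 over the stacked label maps gives the most frequent label per voxel and how many raters agreed, using the older SciPy ModeResult indexing this class relies on (the toy label values below are made up):

import numpy as np
from scipy import stats

votes = np.stack([np.array([[0, 1], [1, 2]]),   # label map from rater 1
                  np.array([[0, 1], [2, 2]]),   # label map from rater 2
                  np.array([[0, 2], [1, 2]])])  # label map from rater 3
majority = stats.mode(votes)      # mode along axis 0, i.e. across raters
labels = majority[0][0]           # most frequent label per voxel -> [[0, 1], [1, 2]]
counts = majority[1][0]           # number of raters that agreed  -> [[3, 2], [2, 3]]
high_confidence = counts >= 0.8 * votes.shape[0]  # analogous to the threshold test above
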
Example #37
0
 def identity(self):
     """Return the average predicted identity of all Tracklet detections."""
     try:
         return mode(self.data[..., 3], axis=None, nan_policy='omit')[0][0]
     except IndexError:
         return -1
def computePercentOfChangeDistributionForAllNamads(
        OutputDir="Distiribution", InputFile="AllNamadsByNamads.pkl"):
    if not os.path.exists(OutputDir):
        os.makedirs(OutputDir)

    f = open(InputFile, "rb")
    allData = pickle.load(f)
    f.close()

    print('start writing results for ' + str(allData.__len__()) + ' Namad')

    pr = 0
    GroupByNamad = {}
    for Namad in allData:
        NamadData = allData[Namad]

        if Namad not in GroupByNamad:
            GroupByNamad[Namad] = []

        GroupByMonth = {}
        for val in NamadData:
            DayData = NamadData[val]
            day = DayData['تاريخ']
            key = f'{day.year:02}' + '-' + f'{day.month:02}'
            if key not in GroupByMonth:
                GroupByMonth[key] = []

            try:
                Name = DayData['نام']
            except KeyError:
                Name = Namad
            # Value = DayData['ارزش']
            # Volume = DayData['حجم']
            Maximum = DayData['بیشترین']
            Minimum = DayData['کمترین']
            ExchangeCount = DayData['دفعات معامله']
            ClosePrice = DayData['مقدار قیمت پایانی']
            # taqirqeymatpayani = DayData[
            #     'تغییر قیمت پایانی']  # the "closing price" is the weighted average of the prices traded on that day
            PercentOfClosePrice = DayData[
                'درصد قیمت پایانی']  # average price of the share over the day
            LastPrice = DayData[
                'مقدار آخرین قیمت']  # the "last trade price" is the most recent price traded up to that moment
            # taqirakharinqeymat = DayData['تغییر آخرین قیمت']
            PercentOfLastPrice = DayData[
                'درصد آخرین قیمت']  # last traded price
            # PriceOfPreDay = DayData['قیمت روز قبل']
            ValueOfBazzar = DayData['ارزش بازار']  # total market value of the symbol's shares

            GroupByMonth[key].append(float(PercentOfClosePrice))

        HistOfMonth = {}
        for m in sorted(GroupByMonth.keys()):
            a = np.asarray(GroupByMonth[m])
            hist, bin_edges = np.histogram(a, density=True)
            # _counts = Counter(a)

            # plt.hist(a, 'auto')
            # reshaped_text = arabic_reshaper.reshape(Namad)
            # text = bidi.algorithm.get_display(reshaped_text)
            # plt.suptitle(text + ' > '+str(m))

            # plt.show()
            average = np.average(a)
            median = np.median(a)
            mode = stats.mode(a)
            std = np.std(a)
            HistOfMonth[m] = {
                'hist': hist,
                'bin': bin_edges,
                'avg': average,
                'median': median,
                'mode': mode,
                'std': std
            }

        GroupByNamad[Namad] = HistOfMonth

    f = open(OutputDir + '/PercentOfChangeDistributionForAllNamads.pkl', "wb")
    pickle.dump(GroupByNamad, f)  # dump the computed per-Namad distributions rather than the raw input
    f.close()
def computePercentOfChangeDistributionForAllNamadsAsWhole(
        OutputDir="Distiribution", InputFile="AllData.pkl"):
    if not os.path.exists(OutputDir):
        os.makedirs(OutputDir)

    f = open(InputFile, "rb")
    allData = pickle.load(f)
    f.close()

    print('start writing results for ' + str(allData.__len__()) + ' day')

    GroupByMonth = {}
    for day in allData:
        DayData = allData[day]
        key = f'{day.year:02}' + '-' + f'{day.month:02}'
        if key not in GroupByMonth:
            GroupByMonth[key] = []

        for Namad in DayData:
            NamadData = DayData[Namad]

            try:
                Name = NamadData['نام']
            except KeyError:
                Name = day
            # Value = NamadData['ارزش']
            # Volume = NamadData['حجم']
            Maximum = NamadData['بیشترین']
            Minimum = NamadData['کمترین']
            ExchangeCount = NamadData['دفعات معامله']
            ClosePrice = NamadData['مقدار قیمت پایانی']
            # taqirqeymatpayani = DayData[
            #     'تغییر قیمت پایانی']  # the "closing price" is the weighted average of the prices traded on that day
            PercentOfClosePrice = NamadData[
                'درصد قیمت پایانی']  # average price of the share over the day
            LastPrice = NamadData[
                'مقدار آخرین قیمت']  # the "last trade price" is the most recent price traded up to that moment
            # taqirakharinqeymat = NamadData['تغییر آخرین قیمت']
            PercentOfLastPrice = NamadData[
                'درصد آخرین قیمت']  # last traded price
            # PriceOfPreDay = NamadData['قیمت روز قبل']
            ValueOfBazzar = NamadData['ارزش بازار']  # total market value of the symbol's shares

            GroupByMonth[key].append(float(PercentOfClosePrice))

    HistOfMonth = {}
    for m in sorted(GroupByMonth.keys()):
        a = np.asarray(GroupByMonth[m])
        hist, bin_edges = np.histogram(a, density=True)
        # _counts = Counter(a)

        # plt.hist(a, bin_edges)
        # plt.suptitle(str(m))

        # plt.show()
        HistOfMonth[m] = {
            'hist': hist,
            'bin': bin_edges,
            'avg': np.average(a),
            'median': np.median(a),
            'mode': stats.mode(a)
        }

    f = open(OutputDir + '/PercentOfChangeDistributionForAllNamadsAsWhole.pkl',
             "wb")
    pickle.dump(HistOfMonth, f)  # dump the computed monthly distributions rather than the raw input
    f.close()
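
Note that stats.mode on continuous data such as these percent changes just returns the most frequent exact value and its count (typically a count of 1 unless values repeat exactly); a minimal sketch with made-up numbers:

import numpy as np
from scipy import stats

a = np.array([1.25, -0.4, 1.25, 3.1, 0.0])
print(np.average(a), np.median(a), np.std(a))
print(stats.mode(a))  # most frequent value (1.25) with a count of 2
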
Example #40
0
def dmg_seed_50_1D(colnum):

    #INITIALIZING STUFF
    Nmitral = 50
    Ngranule = np.copy(Nmitral)  #number of granule cells     pg. 383 of Li/Hop
    Ndim = Nmitral + Ngranule  #total number of cells
    #    t_inh = 25 ; # time when inhalation starts
    #    t_exh = 205; #time when exhalation starts

    #    Ndamagetotal = Nmitral*2 + 1  #number of damage steps
    Ndamage = 3  #steps to reduce entire matrix to zero
    Ncols = int(Nmitral / 2)  #define number of columns to damage

    finalt = 395
    # end time of the cycle

    #y = zeros(ndim,1);

    P_odor0 = np.zeros((Nmitral, 1))  #odor pattern, no odor
    P_odor1 = P_odor0 + .00429  #Odor pattern 1
    #    P_odor2 = 1/70*np.array([.6,.5,.5,.5,.3,.6,.4,.5,.5,.5])
    #    P_odor3 = 4/700*np.array([.7,.8,.5,1.2,.7,1.2,.8,.7,.8,.8])
    #control_odor = control_order + .00429

    #control_odor = np.zeros((Nmitral,1)) #odor input for adaptation

    #controllevel = 1 #1 is full adaptation

    H0 = np.zeros((Nmitral, Ngranule))  #weight matrix: to mitral from granule
    W0 = np.zeros((Ngranule, Nmitral))  #weights: to granule from mitral

    H0 = np.load('H0_50_53Hz.npy')  #load weight matrix

    W0 = np.load('W0_50_53Hz.npy')  #load weight matrix

    #H0 = H0 + H0*np.random.rand(np.shape(H0))
    #W0 = W0+W0*np.random.rand(np.shape(W0))

    M = 5  #average over 5 trials for each level of damage

    #initialize iterative variables
    d1it, d2it, d3it, d4it = np.zeros(M), np.zeros(M), np.zeros(M), np.zeros(M)
    IPRit, IPR2it, pnit = np.zeros(M), np.zeros(M), np.zeros(M)
    frequencyit = np.zeros(M)
    pwrit = np.zeros(M)
    yout2, Sh2 = np.zeros((finalt, Ndim)), np.zeros((finalt, Ndim))
    psi = np.copy(Sh2[:, :Nmitral])

    #initialize quantities to be returned at end of the process
    dmgpct1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    eigfreq1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    d11 = np.zeros(Ncols * (Ndamage - 1) + 1)
    d21 = np.zeros(Ncols * (Ndamage - 1) + 1)
    d31 = np.zeros(Ncols * (Ndamage - 1) + 1)
    d41 = np.zeros(Ncols * (Ndamage - 1) + 1)
    pwr1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    IPR1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    IPR2 = np.zeros(Ncols * (Ndamage - 1) + 1)
    pn1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    freq1 = np.zeros(Ncols * (Ndamage - 1) + 1)
    cell_act = np.zeros((finalt, Ndim, Ncols * (Ndamage - 1) + 1))

    damage = 0
    dam = np.ones(Nmitral)

    #Get the base response first
    Omean1,Oosci1,Omeanbar1,Ooscibar1 = np.zeros((Nmitral,M))+0j,\
                np.zeros((Nmitral,M))+0j,np.zeros(M)+0j,np.zeros(M)+0j
    for m in np.arange(M):
        yout,y0out,Sh,t,OsciAmp1,Omean1[:,m],Oosci1[:,m],Omeanbar1[m],\
            Ooscibar1[m],freq0,maxlam = olf_bulb_10(Nmitral,H0,W0,P_odor1,dam)

    counter = 0  #to get the right index for each of the measures
    damage = 0
    dam[colnum] += .5  # so that first run is for zero damage
    for col in range(Ncols):
        cols = int(np.mod(colnum + col, Nmitral))
        for lv in np.arange(Ndamage):
            #reinitialize all iterative variables to zero (really only need to do for distance measures, but good habit)
            d1it, d2it, d3it, d4it = np.zeros(M), np.zeros(M), np.zeros(
                M), np.zeros(M)
            IPRit, IPR2it, pnit = np.zeros(M), np.zeros(M), np.zeros(M)
            frequencyit = np.zeros(M)
            pwrit = np.zeros(M)
            if not (
                    lv == 0 and cols != colnum
            ):  #if it's the 0th level for any but the original col, skip
                dam[cols] = dam[cols] - .5
                dam[dam < 1e-10] = 0
                damage = np.sum(1 - dam)

                for m in np.arange(M):
                    #Then get response of the damaged network
                    yout2[:,:],y0out2,Sh2[:,:],t2,OsciAmp2,Omean2,Oosci2,Omeanbar2,\
                    Ooscibar2,freq2,grow_eigs2 = olf_bulb_10(Nmitral,H0,W0,P_odor1,dam)
                    #calculate distance measures
                    print(time.time() - tm1)
                    for i in np.arange(M):  #compare against each control trial i
                        d1it[m] += 1 - Omean1[:, i].dot(Omean2) / (
                            lin.norm(Omean1[:, i]) * lin.norm(Omean2))
                        d2it[m] += 1 - lin.norm(Oosci1[:, i].dot(
                            np.conjugate(Oosci2))) / (lin.norm(Oosci1[:, i]) *
                                                      lin.norm(Oosci2))
                        d3it[m] += (Omeanbar1[i] - Omeanbar2) / (Omeanbar1[i] +
                                                                 Omeanbar2)
                        d4it[m] += np.real((Ooscibar1[i] - Ooscibar2) /
                                           (Ooscibar1[i] + Ooscibar2))

                    d1it[m] = d1it[
                        m] / M  #average over comparison with all control trials
                    d2it[m] = d2it[m] / M
                    d3it[m] = d3it[m] / M
                    d4it[m] = d4it[m] / M

                    #calculate spectral density and "wave function" to get average power and IPR
                    #only calculate the spectral density from t=125 to t=250, during the main oscillations
                    P_den = np.zeros((501, Nmitral))
                    for i in np.arange(Nmitral):
                        f, P_den[:, i] = signal.periodogram(Sh2[125:250, i],
                                                            nfft=1000,
                                                            fs=1000)
                    psi = np.zeros(Nmitral)
                    for p in np.arange(Nmitral):
                        psi[p] = np.sum(P_den[:, p])
                    psi = psi / np.sqrt(np.sum(psi**2))

                    psi2 = np.copy(OsciAmp2)
                    psi2 = psi2 / np.sqrt(np.sum(psi2**2))

                    maxAmp = np.max(OsciAmp2)
                    pnit[m] = len(OsciAmp2[OsciAmp2 > maxAmp / 2])

                    IPRit[m] = 1 / np.sum(psi**4)
                    IPR2it[m] = 1 / np.sum(psi2**4)
                    pwrit[m] = np.sum(P_den) / Nmitral

                    #get the frequency according to the adiabatic analysis
                    maxargs = np.argmax(P_den, axis=0)
                    argf = stats.mode(maxargs[maxargs != 0])
                    frequencyit[m] = f[argf[0][0]]
        #            print(cols)
        #            print(time.time()-tm1)
        #
        #        print('level',lv)
        #Get the returned variables for each level of damage
                dmgpct1[counter] = damage / Nmitral
                IPR1[counter] = np.average(IPRit)  #Had to use a 1D list, so it goes:
                pwr1[counter] = np.average(pwrit)  #column 0 damage level 0,1,2,...,Ndamage-1,
                freq1[counter] = np.average(frequencyit)  #then column 1 damage level 0,1,2,...
                #        IPRsd[counter]=np.std(IPRit)
                #        pwrsd[counter]=np.std(pwrit)
                #        freqsd[counter]=np.std(frequencyit)
                IPR2[counter] = np.average(IPR2it)
                pn1[counter] = np.average(pnit)

                d11[counter] = np.average(d1it)
                d21[counter] = np.average(d2it)
                d31[counter] = np.average(d3it)
                d41[counter] = np.average(d4it)
                #        d1sd[counter] =  np.std(d1it)
                #        d2sd[counter] = np.std(d2it)
                #        d3sd[counter]=np.std(d3it)
                #        d4sd[counter]=np.std(d4it)

                eigfreq1[counter] = np.copy(freq2)
                if (colnum == 0 or colnum == int(Nmitral / 2)):
                    cell_act[:, :, counter] = np.copy(yout2)
                counter += 1

    return dmgpct1, eigfreq1, d11, d21, d31, d41, pwr1, IPR1, IPR2, pn1, freq1, cell_act
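
A minimal, self-contained sketch of the dominant-frequency step above: take the peak periodogram bin of each cell, then use stats.mode across cells to pick the network's most common oscillation frequency (synthetic 53 Hz signals; older SciPy ModeResult indexing as in the function above):

import numpy as np
from scipy import signal, stats

fs = 1000
t = np.arange(0, 0.5, 1 / fs)
# ten noisy "cells" all oscillating near 53 Hz
cells = np.sin(2 * np.pi * 53 * t)[:, None] + 0.3 * np.random.randn(len(t), 10)

P_den = np.zeros((501, 10))
for i in range(10):
    f, P_den[:, i] = signal.periodogram(cells[:, i], nfft=1000, fs=fs)

maxargs = np.argmax(P_den, axis=0)        # peak frequency bin of each cell
argf = stats.mode(maxargs[maxargs != 0])  # most common peak bin across cells
print(f[argf[0][0]])                      # ~53.0 Hz
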
Example #41
0
nnArr = []

# Disregard null values
for p in dataset.Price:
    if math.isnan(p):
        pass
    else:
        nnArr.append(int(p))

# Calculate mean
mean = numpy.mean(nnArr)
# Calculate median
median = numpy.median(nnArr)
# Calculate mode
mode = stats.mode(nnArr, axis=None)

#----------------------------------

#Variability - Range
statrange = numpy.ptp(nnArr)
#Variability - Interquartile Range
q3, q1 = numpy.percentile(nnArr, [75, 25])
iqr = q3 - q1
#Variability - Variance
variance = statistics.variance(nnArr)
#Variance - Standard Deviation
stddeviation = numpy.std(nnArr, ddof=1)

#Print values
print("Mean: {}".format(mean))
# 1st Solution
import numpy as np
from scipy import stats

size = int(input())
numbers = list(map(int, input().split()))

print(np.mean(numbers))
print(np.median(numbers))
print(int(stats.mode(numbers)[0]))



# 2nd Solution
import operator
input()
n = list(map(int, input().split()))
w = list(map(int, input().split()))
print('{0:.1f}'.format(sum(map(operator.mul, n, w))/sum(w)))
Example #43
0
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'pooltypeid7', 'propertycountylandusecode',
       'propertylandusetypeid', 'propertyzoningdesc', 'rawcensustractandblock',
       'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip',
       'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yearbuilt',
       'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
       'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount',
       'censustractandblock']

for column in column_names2:
    print("% of null values",column," = ",(pd.isnull(properties1[column]).sum()/2985217)*100)

df = train.merge(properties1, how = 'left',on = 'parcelid')

from scipy.stats import mode
df['buildingqualitytypeid'].fillna(mode(df['buildingqualitytypeid']).mode[0],inplace = True)
df['calculatedbathnbr'].fillna(mode(df['calculatedbathnbr']).mode[0],inplace = True)
df['calculatedfinishedsquarefeet'].fillna(mode(df['calculatedfinishedsquarefeet']).mode[0],inplace = True)
df['finishedsquarefeet12'].fillna(mode(df['finishedsquarefeet12']).mode[0],inplace = True)
df['fullbathcnt'].fillna(mode(df['fullbathcnt']).mode[0],inplace = True)
df['garagecarcnt'].fillna(mode(df['garagecarcnt']).mode[0],inplace = True)
df['garagetotalsqft'].fillna(mode(df['garagetotalsqft']).mode[0],inplace = True)
df['heatingorsystemtypeid'].fillna(mode(df['heatingorsystemtypeid']).mode[0],inplace = True)
df['lotsizesquarefeet'].fillna(mode(df['lotsizesquarefeet']).mode[0],inplace = True)
df['propertycountylandusecode'].fillna('0100', inplace = True)
df['propertyzoningdesc'].fillna('LAR3',inplace = True)
df['regionidcity'].fillna(mode(df['regionidcity']).mode[0],inplace = True)

df.drop(['regionidneighborhood'],axis = 1, inplace = True)

column_names2.remove('regionidneighborhood')
Example #44
0
        def get_statistics(atoms, SelectedHingeResidues, filename='Output'):
            """This sub-method is used to get the statistical data on the hinges and print it into a file.
            
            Notes:
                * Function level: 1 (1 being top)
                * Do something about the output file

            Args:
                atoms ([packman.molecule.Atom])                   : Set of atoms. (Read parent method description)
                SelectedHingeResidues ([packman.molecule.Residue]): Predicted hinge residues. 
                filename (str, optional)                          : Output file name. Defaults to 'Output'.
            
            Returns:
                [p-value, stats] (float): p-value of the predicted hinge, statistics of the hinge (in that order)
            """
            hinge_atoms = [i.get_backbone() for i in SelectedHingeResidues]
            hinge_atoms = [item for sublist in hinge_atoms for item in sublist]
            non_hinge_atoms = list(set([i for i in atoms]) - set(hinge_atoms))
            all_atoms_bfactor = [i.get_bfactor() for i in atoms]
            hinge_atoms_bfactor = [i.get_bfactor() for i in hinge_atoms]
            non_hinge_atoms_bfactor = [
                i.get_bfactor() for i in non_hinge_atoms
            ]

            return_stats = []

            outputfile.write(
                '\nSTATISTICS\n\t\tN\tMin\tMax\tMean\tMode\tMedian\tSTDDev\n')
            return_stats.append(
                ['', 'N', 'Min', 'Max', 'Mean', 'Mode', 'Median', 'STDDev'])
            outputfile.write('Total   ' + '\t' + str(len(all_atoms_bfactor)) +
                             '\t' + str(numpy.min(all_atoms_bfactor)) + '\t' +
                             str(numpy.max(all_atoms_bfactor)) + '\t' +
                             str(numpy.mean(all_atoms_bfactor)) + '\t' +
                             str(mode(all_atoms_bfactor)[0][0]) + '\t' +
                             str(numpy.median(all_atoms_bfactor)) + '\t' +
                             str(numpy.std(all_atoms_bfactor)) + '\n')
            return_stats.append([
                'Total',
                len(all_atoms_bfactor),
                numpy.min(all_atoms_bfactor),
                numpy.max(all_atoms_bfactor),
                numpy.mean(all_atoms_bfactor),
                mode(all_atoms_bfactor)[0][0],
                numpy.median(all_atoms_bfactor),
                numpy.std(all_atoms_bfactor)
            ])
            outputfile.write('Hinge   ' + '\t' +
                             str(len(hinge_atoms_bfactor)) + '\t' +
                             str(numpy.min(hinge_atoms_bfactor)) + '\t' +
                             str(numpy.max(hinge_atoms_bfactor)) + '\t' +
                             str(numpy.mean(hinge_atoms_bfactor)) + '\t' +
                             str(mode(hinge_atoms_bfactor)[0][0]) + '\t' +
                             str(numpy.median(hinge_atoms_bfactor)) + '\t' +
                             str(numpy.std(hinge_atoms_bfactor)) + '\n')
            return_stats.append([
                'Hinge',
                len(hinge_atoms_bfactor),
                numpy.min(hinge_atoms_bfactor),
                numpy.max(hinge_atoms_bfactor),
                numpy.mean(hinge_atoms_bfactor),
                mode(hinge_atoms_bfactor)[0][0],
                numpy.median(hinge_atoms_bfactor),
                numpy.std(hinge_atoms_bfactor)
            ])
            outputfile.write('NonHinge' + '\t' +
                             str(len(non_hinge_atoms_bfactor)) + '\t' +
                             str(numpy.min(non_hinge_atoms_bfactor)) + '\t' +
                             str(numpy.max(non_hinge_atoms_bfactor)) + '\t' +
                             str(numpy.mean(non_hinge_atoms_bfactor)) + '\t' +
                             str(mode(non_hinge_atoms_bfactor)[0][0]) + '\t' +
                             str(numpy.median(non_hinge_atoms_bfactor)) +
                             '\t' + str(numpy.std(non_hinge_atoms_bfactor)) +
                             '\n')
            return_stats.append([
                'NonHinge',
                len(non_hinge_atoms_bfactor),
                numpy.min(non_hinge_atoms_bfactor),
                numpy.max(non_hinge_atoms_bfactor),
                numpy.mean(non_hinge_atoms_bfactor),
                mode(non_hinge_atoms_bfactor)[0][0],
                numpy.median(non_hinge_atoms_bfactor),
                numpy.std(non_hinge_atoms_bfactor)
            ])

            p_value = permutation_test(hinge_atoms_bfactor,
                                       non_hinge_atoms_bfactor,
                                       method='approximate',
                                       num_rounds=10000,
                                       seed=0)
            outputfile.write('\np-value:\t' + str(p_value) + '\n')
            return p_value, return_stats
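
The p-value above appears to come from mlxtend's permutation_test (the keyword arguments match that function); a minimal sketch with hypothetical B-factor values, assuming that import:

from mlxtend.evaluate import permutation_test

hinge_bfactors = [22.1, 25.4, 24.8, 26.0, 23.5]      # hypothetical values
non_hinge_bfactors = [15.2, 14.8, 16.1, 15.9, 14.3]  # hypothetical values

# approximate (Monte Carlo) two-sample permutation test on the difference of means
p_value = permutation_test(hinge_bfactors, non_hinge_bfactors,
                           method='approximate', num_rounds=10000, seed=0)
print(p_value)
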
Example #45
0
    def sum_org_images_of_type(self,
                               messages,
                               has_filters,
                               type_name,
                               dir_name,
                               master_file_name=None):
        """Generates a summary for the images.
        
        Args:    
            messages: List where the messages are added.
            has_filters: Indicates if the directories are organized by filters.
            type_name: Name of the type of image analyzed.
            dir_name: Directory to walk to search for images.
            master_file_name: Name of the master file, if any.
                    
        """

        messages.append(["> Summary for %s files." % (type_name)])

        subdirectories, files, directories_from_root = \
            self.walk_directories(self._target_dir, "*", dir_name, True)

        # Number of directories with data (from root).
        number_of_directories = len(directories_from_root)

        if has_filters:
            # Store a list of unique filters. Take all the paths and split them
            # to get the filter component, and add all to a set, that is converted
            # to a list.
            filters = list(set([s.split(os.sep)[-1] for s in subdirectories]))

            messages.append([
                "Number of filters with %s files is: %d" %
                (type_name, len(filters))
            ])

            messages.append(
                ["Filters used by %s: %s" % (type_name, str(filters))])

        # Get the list of directories found containing files.
        unique_paths = set([ff[PATH_COL] for ff in files])

        # Summary: Number of unique directories.
        messages.append(
            ["Number of %s directories: %d" % (type_name, len(unique_paths))])

        # The number of files in each directory is stored here.
        num_files_by_dir = []

        # Number of master files found in each directory; it is calculated
        # only when a master file name is given.
        num_master = 0

        # Apply the following statistics if these files are used to create a master
        # file (i.e. bias or flats).
        if master_file_name is not None:
            # Summary: Number of master files created.
            master = [fb for fb in files if fb[FILE_NAME_COL] == \
                      master_file_name]
            num_master = len(master)
            messages.append(
                ["Number of master %s: %d" % (type_name, num_master)])

            # Number of directories with files and without a master (important!).
            dir_without_master = []

            for ubp in unique_paths:
                # Get the files of each directory.
                files_of_dir = [bf for bf in files if bf[PATH_COL] == ubp]

                # Get the master file of this directory if any.
                master_file = [bf for bf in files_of_dir \
                                 if bf[FILE_NAME_COL] == master_file_name]

                # If this directory has not master.
                if len(master_file) == 0:
                    dir_without_master.extend([ubp])

                # The number of files in this directory is the total number
                # of files minus the master fits found.
                num_of_files = len(files_of_dir) - len(master_file)
                num_files_by_dir.extend([num_of_files])

                messages.append([
                    "Directory: '%s' Number of files: %d" % (ubp, num_of_files)
                ])

            # Summary: Number of directories with files and no master.
            messages.append([
                "Number of directories with %s and no master %s: %d" %
                (type_name, type_name, len(dir_without_master))
            ])

            # If any directory has no master, show its path.
            if len(dir_without_master) > 0:
                messages.append([
                    "Directories without master %s : %s" %
                    (type_name, str(dir_without_master))
                ])

        else:
            # Count the number of files in each directory, now taking into
            # account the file names and their number instead of master files.
            for ubp in unique_paths:

                # Objects in the directory whose path matched those of unique set.
                all_objects_of_dir = [ f[FILE_NAME_COL] for f in files \
                                      if f[PATH_COL] == ubp ]

                # Take as object names those of FIT images that are not final,
                # keeping only the part of the name that identifies the object.
                objects_of_dir = [ o[:o.find(DATANAME_CHAR_SEP)]
                                  for o in all_objects_of_dir \
                                  if o.find("." + FIT_FILE_EXT) > 0 and
                                  o.find(DATA_FINAL_SUFFIX) < 0 ]

                # The number of files in this directory is the total number
                # of files minus the master fits found.
                num_files_by_dir.extend([len(objects_of_dir)])

                messages.append([
                    "Directory: '%s' Number of files: %d" %
                    (ubp, len(objects_of_dir))
                ])

                unique_objects_of_dir = set(objects_of_dir)

                for uo in unique_objects_of_dir:
                    num_objs = len([o for o in objects_of_dir if o == uo])

                    messages.append(
                        ["Object: '%s' Number of files: %d" % (uo, num_objs)])

        # Create a set containing the root directories that contains files.
        # The source set contains a directory for each filter, so it may
        # contain several directories for each root directory.
        unique_root_dir_with_files = set(
            [x.split(os.sep)[1] for x in unique_paths])

        # Summary: Number of directories without files.
        # The total number of directories minus the number of directories with files.
        num_dir_without_files = \
            number_of_directories - len(unique_root_dir_with_files)
        messages.append([
            "Number of directories without %s files: %d" %
            (type_name, num_dir_without_files)
        ])

        # Summary: Number of files.
        num_files = sum(num_files_by_dir)
        messages.append(["Number of %s files is: %d" % (type_name, num_files)])

        if len(num_files_by_dir) > 0:
            max_files_by_dir = max(num_files_by_dir)
            min_files_by_dir = min(num_files_by_dir)
            avg_files_by_dir = sum(num_files_by_dir) / len(num_files_by_dir)
            std_files_by_dir = np.std(num_files_by_dir)
            med_files_by_dir = np.median(num_files_by_dir)
            mode_files_by_dir = mode(num_files_by_dir)[0][0]
        else:
            max_files_by_dir = 0
            min_files_by_dir = 0
            avg_files_by_dir = 0
            std_files_by_dir = 0
            med_files_by_dir = 0
            mode_files_by_dir = 0

        # Summary: Maximum number of files in directories.
        messages.append([
            "Maximum number of %s files in directories: %d" %
            (type_name, max_files_by_dir)
        ])

        # Summary: Minimum number of files in directories.
        messages.append([
            "Minimum number of %s files in directories: %d" %
            (type_name, min_files_by_dir)
        ])

        # Summary: Average of number of files in directories.
        messages.append([
            "Average of number of %s files in directories: %.10g" %
            (type_name, avg_files_by_dir)
        ])

        # Summary: Standard deviation of number of files in directories.
        messages.append([
            "Standard deviation of number of %s files in directories: %.10g" %
            (type_name, std_files_by_dir)
        ])

        # Summary: Median of number of files in directories.
        messages.append([
            "Median of number of %s files in directories: %.10g" %
            (type_name, med_files_by_dir)
        ])

        # Summary: Mode of number of files in directories.
        messages.append([
            "Mode of number of %s files in directories: %.10g" %
            (type_name, mode_files_by_dir)
        ])
Example #46
0
incomes = np.random.normal(27000, 15000, 10000)
#loc=150, scale=20, size=1000

print(type(incomes))
print(incomes.size)
print(incomes)
print(len(incomes))
print(incomes.ndim)
print(incomes.shape)
print(incomes.dtype)

print("Mean value is: ", np.mean(incomes))
print("Median value is: ", np.median(incomes))

from scipy import stats
print("Mode value is: ", stats.mode(incomes)[0])

print("Minimum value is: ", np.min(incomes))
print("Maximum value is: ", np.max(incomes))
print("Standard Deviation is: ", np.std(incomes))
#print("Correlation coefficient value is: ", np.corrcoef(incomes))

#We can segment the income data into 20 buckets, and plot it as a histogram:
import matplotlib.pyplot as plt
plt.hist(incomes, 20)
plt.show()

#box and whisker plot to show distribution
#https://chartio.com/resources/tutorials/what-is-a-box-plot/
plt.boxplot(incomes)
"""
Example #47
0
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

df = pd.read_csv('train.csv')

df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

age_mean = df['Age'].mean()
age_median = df['Age'].median()

embarked_mode = mode(df['Embarked'])[0][0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)

df = df.drop(['Sex', 'Embarked'], axis=1)

cols = df.columns.tolist()
cols = [cols[1]] + cols[0:1] + cols[2:]
df = df[cols]

df = df.fillna(-1)

train_data = df.values
"""
Example #48
0
 def _fraserMode(self, multi=0.1):
     y = num.array(self.data * multi).astype(int)
     mode = stats.mode(y)[0]
     w = num.where(y == mode)
     return num.median(self.data[w[0]])
Example #49
0
    def track(self, poses, identities=None):
        self.n_frames += 1

        trackers = np.zeros((len(self.trackers), 6))
        for i in range(len(trackers)):
            trackers[i, :5] = self.trackers[i].predict()
        empty = np.isnan(trackers).any(axis=1)
        trackers = trackers[~empty]
        for ind in np.flatnonzero(empty)[::-1]:
            self.trackers.pop(ind)

        ellipses = []
        pred_ids = []
        for i, pose in enumerate(poses):
            el = self.fitter.fit(pose)
            if el is not None:
                ellipses.append(el)
                if identities is not None:
                    pred_ids.append(mode(identities[i])[0][0])
        if not len(trackers):
            matches = np.empty((0, 2), dtype=int)
            unmatched_detections = np.arange(len(ellipses))
            unmatched_trackers = np.empty((0, 6), dtype=int)
        else:
            ellipses_trackers = [Ellipse(*t[:5]) for t in trackers]
            cost_matrix = np.zeros((len(ellipses), len(ellipses_trackers)))
            for i, el in enumerate(ellipses):
                for j, el_track in enumerate(ellipses_trackers):
                    cost = el.calc_similarity_with(el_track)
                    if identities is not None:
                        match = 2 if pred_ids[i] == self.trackers[j].id_ else 1
                        cost *= match
                    cost_matrix[i, j] = cost
            row_indices, col_indices = linear_sum_assignment(cost_matrix, maximize=True)
            unmatched_detections = [
                i for i, _ in enumerate(ellipses) if i not in row_indices
            ]
            unmatched_trackers = [
                j for j, _ in enumerate(trackers) if j not in col_indices
            ]
            matches = []
            for row, col in zip(row_indices, col_indices):
                val = cost_matrix[row, col]
                # diff = val - cost_matrix
                # diff[row, col] += val
                # if (
                #         val < self.iou_threshold
                #         or np.any(diff[row] <= 0.2)
                #         or np.any(diff[:, col] <= 0.2)
                # ):
                if val < self.iou_threshold:
                    unmatched_detections.append(row)
                    unmatched_trackers.append(col)
                else:
                    matches.append([row, col])
            if not len(matches):
                matches = np.empty((0, 2), dtype=int)
            else:
                matches = np.stack(matches)
            unmatched_trackers = np.asarray(unmatched_trackers)
            unmatched_detections = np.asarray(unmatched_detections)

        animalindex = []
        for t, tracker in enumerate(self.trackers):
            if t not in unmatched_trackers:
                ind = matches[matches[:, 1] == t, 0][0]
                animalindex.append(ind)
                tracker.update(ellipses[ind].parameters)
            else:
                animalindex.append(-1)

        for i in unmatched_detections:
            trk = EllipseTracker(ellipses[i].parameters)
            if identities is not None:
                trk.id_ = mode(identities[i])[0][0]
            self.trackers.append(trk)
            animalindex.append(i)

        i = len(self.trackers)
        ret = []
        for trk in reversed(self.trackers):
            d = trk.state
            if (trk.time_since_update < 1) and (
                trk.hit_streak >= self.min_hits or self.n_frames <= self.min_hits
            ):
                ret.append(
                    np.concatenate((d, [trk.id, int(animalindex[i - 1])])).reshape(
                        1, -1
                    )
                )  # for DLC we also return the original animalid
                # +1 as MOT benchmark requires positive >> this is removed for DLC!
            i -= 1
            # remove dead tracklet
            if trk.time_since_update > self.max_age:
                self.trackers.pop(i)

        if len(ret) > 0:
            return np.concatenate(ret)
        return np.empty((0, 7))
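
The detection-to-tracker matching above is a standard assignment problem; a minimal sketch of scipy.optimize.linear_sum_assignment with maximize=True on a made-up similarity matrix:

import numpy as np
from scipy.optimize import linear_sum_assignment

# similarity between 3 detections (rows) and 2 existing trackers (columns)
similarity = np.array([[0.9, 0.1],
                       [0.2, 0.8],
                       [0.4, 0.3]])
rows, cols = linear_sum_assignment(similarity, maximize=True)
print([(int(r), int(c)) for r, c in zip(rows, cols)])  # [(0, 0), (1, 1)]; detection 2 stays unmatched
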
Example #50
0
for i in range(len(x_test_num_avg)-1):
    x_test_num_concat = pd.concat([x_test_num_concat,x_test_num_avg[i+1][1]], axis=1)
x_test_num_avg = x_test_num_concat.groupby(by=x_test_num_concat.columns, axis=1).apply(lambda g: g.mean(axis=1))


# Categorical Imputer
imputer_txt = MultipleImputer(strategy='categorical', return_list=True, n=10, seed=101)
x_train_txt_avg = imputer_txt.fit_transform(x_train_txt)

x_train_txt_col = list(x_train_txt.columns)
x_train_txt_col.sort()
x_train_txt_concat = x_train_txt_avg[0][1]

for i in range(len(x_train_txt_avg)-1):
    x_train_txt_concat = pd.concat([x_train_txt_concat, x_train_txt_avg[i+1][1]], axis=1)
x_train_txt_avg = x_train_txt_concat.groupby(by=x_train_txt_concat.columns, axis=1).apply(lambda g: stats.mode(g, axis=1)[0])
x_train_txt_avg = x_train_txt_avg.sort_index(axis=0)
x_train_txt_avg_temp = pd.DataFrame(x_train_txt_avg[0])
for i in range(len(x_train_txt_avg)-1):
    x_train_txt_avg_temp = pd.concat([x_train_txt_avg_temp,pd.DataFrame(x_train_txt_avg[i+1])], axis=1)
x_train_txt_avg_temp.columns = x_train_txt_col
x_train_txt_avg = x_train_txt_avg_temp
x_train_txt = x_train_txt.sort_index(axis=1)


x_test_txt_avg = imputer_txt.fit_transform(x_test_txt)

x_test_txt_col = list(x_test_txt.columns)
x_test_txt_col.sort()
x_test_txt_concat = x_test_txt_avg[0][1]
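
For reference, a minimal sketch of how stats.mode across the duplicated columns pools several imputations into one value per row (integer-coded categories here; the string-valued case above relies on the older SciPy mode behaviour):

import numpy as np
from scipy import stats

# three imputed copies of the same categorical column, integer-coded, one row per sample
pooled = np.array([[0, 0, 1],
                   [1, 1, 0],
                   [2, 2, 2]])
combined = stats.mode(pooled, axis=1)[0].ravel()  # row-wise majority vote
print(combined)  # [0 1 2]
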
def app():
    global input_df

    st.title('Home')

    st.write(
        'First adjust the backtest parameters on the left, then launch the backtest by pressing the button below.'
    )

    st.sidebar.header("Backtest parameters")

    with st.form("input_params"):

        session_state.startdate = st.sidebar.date_input(
            'start date',
            value=session_state.startdate,
            min_value=datetime.strptime('1900-01-01', '%Y-%m-%d'),
            max_value=date.today(),
            key='startdate',
            help='start date of the backtest')
        session_state.enddate = st.sidebar.date_input(
            'end date',
            value=session_state.enddate,
            min_value=datetime.strptime('1900-01-01', '%Y-%m-%d'),
            max_value=date.today(),
            key='enddate',
            help='end date of the backtest')

        session_state.initial_cash = st.sidebar.number_input(
            "initial cash",
            min_value=0.0,
            max_value=None,
            value=session_state.initial_cash,
            step=1000.0,
            format='%f',
            key='initial_cash',
            help='initial cash')
        session_state.contribution = st.sidebar.number_input(
            "contribution or withdrawal",
            min_value=None,
            max_value=None,
            value=session_state.contribution,
            format='%f',
            step=0.01,
            key='contribution',
            help=
            'contribution or withdrawal. Can be specified as % of the portfolio value or in absolute terms.'
        )
        session_state.leverage = st.sidebar.number_input(
            "leverage",
            min_value=1.0,
            max_value=None,
            step=0.01,
            value=session_state.leverage,
            format='%f',
            key='leverage',
            help='daily leverage to apply to assets returns')
        session_state.expense_ratio = st.sidebar.number_input(
            "expense ratio",
            min_value=0.0,
            max_value=1.0,
            step=0.01,
            value=session_state.expense_ratio,
            format='%f',
            key='expense_ratio',
            help='annual expense ratio')

        st.sidebar.subheader("Assets")

        if session_state.historic == "Yahoo Finance (daily prices)":
            idx = 0
        elif session_state.historic == "Historical DB (daily prices)":
            idx = 1
        else:
            idx = 2

        session_state.historic = st.sidebar.radio(
            'data source',
            ("Yahoo Finance (daily prices)", "Historical DB (daily prices)",
             "Historical DB (yearly prices)"),
            index=idx,
            key='historic',
            help='choose the data source')
        if session_state.historic == "Yahoo Finance (daily prices)":
            historic_cd = None
        elif session_state.historic == "Historical DB (daily prices)":
            historic_cd = 'medium'
        elif session_state.historic == "Historical DB (yearly prices)":
            historic_cd = 'long'

        session_state.shares = st.sidebar.text_area(
            "assets to backtest",
            value=session_state.shares,
            height=None,
            max_chars=None,
            key="shares",
            help='tickers in a comma separated list (e.g. "SPY,TLT,GLD")')
        session_state.shareclass = st.sidebar.text_area(
            "assets class (for Yahoo Finance only)",
            value=session_state.shareclass,
            height=None,
            max_chars=None,
            key="shareclass",
            help=
            'class of each asset (e.g. `equity,bond_lt,gold`). Possibilities are `equity, bond_lt, bond_it, gold, commodity`, where "bond_lt" and "bond_it" are long and intermediate duration bonds, respectively. __This argument is mandatory when the data source is Yahoo Finance.__'
        )
        session_state.weights = st.sidebar.text_area(
            "asset weights",
            value=session_state.weights,
            height=None,
            max_chars=None,
            key="weights",
            help=
            'list of portfolio weights for each asset specified (e.g. `0.35,0.35,0.30`). The weights need to sum to 1. When weights are specified a custom weights strategy is used that simply loads the weights specified. Alternative is to use a pre-defined strategy.'
        )

        session_state.benchmark = st.sidebar.text_input(
            "benchmark",
            value=session_state.benchmark,
            max_chars=None,
            key='benchmark',
            help='ticker of a benchmark')
        session_state.indicator = st.sidebar.checkbox(
            "signal assets",
            value=session_state.indicator,
            key='indicators',
            help='load the signal assets needed for the rotation strategy')

        st.sidebar.subheader("Strategies")

        session_state.riskparity = st.sidebar.checkbox(
            'risk parity',
            value=session_state.riskparity,
            key='riskparity',
            help=
            'Dynamic allocation of weights according to the risk parity methodology (see https://thequantmba.wordpress.com/2016/12/14/risk-parityrisk-budgeting-portfolio-in-python/). Here the risk parity is run at portfolio level.'
        )
        session_state.riskparity_nested = st.sidebar.checkbox(
            'risk parity nested',
            value=session_state.riskparity_nested,
            key='riskparity_nested',
            help=
            'Dynamic allocation of weights according to the risk parity methodology (see https://thequantmba.wordpress.com/2016/12/14/risk-parityrisk-budgeting-portfolio-in-python/). Here the risk parity is run first at asset classe level (for assets belonging to the same asset class) and then at portfolio level.'
        )
        session_state.rotationstrat = st.sidebar.checkbox(
            'asset rotation',
            value=session_state.rotationstrat,
            key='rotationstrat',
            help=
            'Asset rotation strategy that buy either gold, bonds or equities based on a signal (see https://seekingalpha.com/article/4283733-simple-rules-based-asset-rotation-strategy). To use this strategy tick the box signal assets.'
        )
        session_state.uniform = st.sidebar.checkbox(
            'uniform',
            value=session_state.uniform,
            key='uniform',
            help=
            'Static allocation uniform across asset classes. Assets are allocated uniformly within the same asset class.'
        )
        session_state.vanillariskparity = st.sidebar.checkbox(
            'static risk parity',
            value=session_state.vanillariskparity,
            key='vanillariskparity',
            help=
            'Static allocation to asset classes where weights are taken from https://www.theoptimizingblog.com/leveraged-all-weather-portfolio/ (see section "True Risk Parity").'
        )
        session_state.onlystocks = st.sidebar.checkbox(
            'only equity',
            value=session_state.onlystocks,
            key='onlystocks',
            help=
            'Static allocation only to the equity class. Assets are allocated uniformly within the equity class.'
        )
        session_state.sixtyforty = st.sidebar.checkbox(
            '60% equities 40% bonds',
            value=session_state.sixtyforty,
            key='sixtyforty',
            help=
            'Static allocation 60% to the equity class, 20% to the Long Term Bonds class and 20% to the Short Term Bonds class. Assets are allocated uniformly within the asset classes.'
        )
        session_state.trend_u = st.sidebar.checkbox(
            'trend uniform',
            value=session_state.trend_u,
            key='trend_u',
            help=
            'First weights are assigned according to the "uniform" strategy. Then, if the current asset price is smaller than the simple moving average, the weight is set to zero (leave as cash).'
        )
        session_state.absmom_u = st.sidebar.checkbox(
            'absolute momentum uniform',
            value=session_state.absmom_u,
            key='absmom_u',
            help=
            'First weights are assigned according to the "uniform" strategy. Then, if the asset return over the period (momentum) is less than 0, the weight is set to zero (leave as cash).'
        )
        session_state.relmom_u = st.sidebar.checkbox(
            'relative momentum uniform',
            value=session_state.relmom_u,
            key='relmom_u',
            help=
            'First assets are ranked based on their return over the period (momentum) and divided in two classes. The portfolio is formed by the assets belonging to the higher return class. Then, weights are assigned to this portfolio according to the "uniform" strategy.'
        )
        session_state.momtrend_u = st.sidebar.checkbox(
            'relative momentum & trend uniform',
            value=session_state.momtrend_u,
            key='momtrend_u',
            help=
            'First weights are assigned according to the "uniform" strategy. Second, assets are ranked based on their return over the period (momentum) and divided in two classes. For the assets belonging to the lower return class, the weight is set to zero (leave as cash). Finally, a trend filter is then applied to assets with positive weight: if the current asset price is smaller than the simple moving average, the weight is set to zero (leave as cash).'
        )
        session_state.trend_rp = st.sidebar.checkbox(
            'trend risk parity',
            value=session_state.trend_rp,
            key='trend_rp',
            help=
            'First weights are assigned according to the "riskparity" strategy. Then, if the current asset price is smaller than the simple moving average, the weight is set to zero (leave as cash).'
        )
        session_state.absmom_rp = st.sidebar.checkbox(
            'absolute momentum risk parity',
            value=session_state.absmom_rp,
            key='absmom_rp',
            help=
            'First weights are assigned according to the "riskparity" strategy. Then, if the asset return over the period (momentum) is less than 0, the weight is set to zero (leave as cash).'
        )
        session_state.relmom_rp = st.sidebar.checkbox(
            'relative momentum risk parity',
            value=session_state.relmom_rp,
            key='relmom_rp',
            help=
            'First assets are ranked based on their return over the period (momentum) and divided in two classes. The portfolio is formed by the assets belonging to the higher return class. Then, weights are assigned to this portfolio according to the "risk parity" strategy.'
        )
        session_state.momtrend_rp = st.sidebar.checkbox(
            'relative momentum & trend risk parity',
            value=session_state.momtrend_rp,
            key='momtrend_rp',
            help=
            'First weights are assigned according to the "riskparity" strategy. Second, assets are ranked based on their return over the period (momentum) and divided in two classes. For the assets belonging to the lower return class, the weight is set to zero (leave as cash). Finally, a trend filter is then applied to assets with positive weight: if the current asset price is smaller than the simple moving average, the weight is set to zero (leave as cash).'
        )
        session_state.GEM = st.sidebar.checkbox(
            'Global equity momentum',
            value=session_state.GEM,
            key='GEM',
            help=
            'Global equity momentum strategy. Needs only 4 assets of classes equity, equity_intl, bond_lt, money_market. example: `VEU,IVV,BIL,AGG equity_intl,equity,money_market,bond_lt`. See https://blog.thinknewfound.com/2019/01/fragility-case-study-dual-momentum-gem/'
        )
        session_state.acc_dualmom = st.sidebar.checkbox(
            'Accelerating Dual Momentum',
            value=session_state.acc_dualmom,
            key='acc_dualmom',
            help=
            'Accelerating Dual Momentum. Needs only 3 assets of classes equity, equity_intl, bond_lt. example: VFINX,VINEX,VUSTX, shareclass equity,equity_intl,bond_lt. See https://engineeredportfolio.com/2018/05/02/accelerating-dual-momentum-investing/'
        )
        session_state.acc_dualmom2 = st.sidebar.checkbox(
            'Accelerating Dual Momentum (extended)',
            value=session_state.acc_dualmom2,
            key='acc_dualmom2',
            help=
            'Accelerating Dual Momentum (extended). Needs only 4 assets of classes equity, equity_intl, bond_lt, gold. example: VFINX,VINEX,VUSTX,GLD shareclass equity,equity_intl,bond_lt,gold.'
        )

        st.sidebar.subheader("HTML Report")
        # session_state.create_report = st.sidebar.checkbox('create PDF report', value=session_state.create_report,
        #                                                   key='create_report', help=None)
        session_state.report_name = st.sidebar.text_input(
            "report name",
            value=session_state.report_name,
            max_chars=None,
            key='report_name',
            help=None)
        session_state.user = st.sidebar.text_input(
            "user name",
            value=session_state.user,
            max_chars=None,
            key='user',
            help='user generating the report')
        session_state.memo = st.sidebar.text_input(
            "report memo",
            value=session_state.memo,
            max_chars=None,
            key='memo',
            help='description of the report')

        #launch_btn = st.button("Launch backtest")
        launch_btn = st.form_submit_button("Launch backtest")

    if launch_btn:
        params['startdate'] = session_state.startdate
        params['enddate'] = session_state.enddate
        params['initial_cash'] = session_state.initial_cash
        params['contribution'] = session_state.contribution
        params['leverage'] = session_state.leverage
        params['expense_ratio'] = session_state.expense_ratio
        params['historic'] = historic_cd
        params['shares'] = session_state.shares
        params['shareclass'] = session_state.shareclass
        params['weights'] = session_state.weights
        params['benchmark'] = session_state.benchmark
        params['indicator'] = session_state.indicator
        params['riskparity'] = session_state.riskparity
        params['riskparity_nested'] = session_state.riskparity_nested
        params['rotationstrat'] = session_state.rotationstrat
        params['uniform'] = session_state.uniform
        params['vanillariskparity'] = session_state.vanillariskparity
        params['onlystocks'] = session_state.onlystocks
        params['sixtyforty'] = session_state.sixtyforty
        params['trend_u'] = session_state.trend_u
        params['absmom_u'] = session_state.absmom_u
        params['relmom_u'] = session_state.relmom_u
        params['momtrend_u'] = session_state.momtrend_u
        params['trend_rp'] = session_state.trend_rp
        params['absmom_rp'] = session_state.absmom_rp
        params['relmom_rp'] = session_state.relmom_rp
        params['momtrend_rp'] = session_state.momtrend_rp
        params['GEM'] = session_state.GEM
        params['acc_dualmom'] = session_state.acc_dualmom
        params['acc_dualmom2'] = session_state.acc_dualmom2
        params['create_report'] = session_state.create_report
        params['report_name'] = session_state.report_name
        params['user'] = session_state.user
        params['memo'] = session_state.memo
        # advanced params
        params['DAYS_IN_YEAR'] = session_state.DAYS_IN_YEAR
        params[
            'DAYS_IN_YEAR_BOND_PRICE'] = session_state.DAYS_IN_YEAR_BOND_PRICE
        params['reb_days_days'] = session_state.reb_days_days
        params['reb_days_years'] = session_state.reb_days_years
        params['reb_days_custweights'] = session_state.reb_days_custweights
        params[
            'lookback_period_short_days'] = session_state.lookback_period_short_days
        params[
            'lookback_period_short_years'] = session_state.lookback_period_short_years
        params[
            'lookback_period_short_custweights'] = session_state.lookback_period_short_custweights
        params[
            'lookback_period_long_days'] = session_state.lookback_period_long_days
        params[
            'lookback_period_long_years'] = session_state.lookback_period_long_years
        params[
            'lookback_period_long_custweights'] = session_state.lookback_period_long_custweights
        params[
            'moving_average_period_days'] = session_state.moving_average_period_days
        params[
            'moving_average_period_years'] = session_state.moving_average_period_years
        params[
            'moving_average_period_custweights'] = session_state.moving_average_period_custweights
        params['momentum_period_days'] = session_state.momentum_period_days
        params['momentum_period_years'] = session_state.momentum_period_years
        params[
            'momentum_period_custweights'] = session_state.momentum_period_custweights
        params[
            'momentum_percentile_days'] = session_state.momentum_percentile_days
        params[
            'momentum_percentile_years'] = session_state.momentum_percentile_years
        params[
            'momentum_percentile_custweights'] = session_state.momentum_percentile_custweights
        params['corrmethod_days'] = session_state.corrmethod_days
        params['corrmethod_years'] = session_state.corrmethod_years
        params['corrmethod_custweights'] = session_state.corrmethod_custweights
        params['riskfree'] = session_state.riskfree
        params['targetrate'] = session_state.targetrate
        params['alpha'] = session_state.alpha
        params['market_mu'] = session_state.market_mu
        params['market_sigma'] = session_state.market_sigma
        params['stddev_sample'] = session_state.stddev_sample
        params['annualize'] = session_state.annualize
        params['logreturns'] = session_state.logreturns

    #if input_df != 0:
    mainout = main(params)
    if mainout is not False:
        input_df = copy.deepcopy(mainout)

        # Portfolio value
        idx = 0
        columns = input_df[idx].columns
        input_df[idx]['date'] = input_df[idx].index
        input_df_long = pd.melt(input_df[idx],
                                id_vars=['date'],
                                value_vars=columns,
                                var_name='strategy',
                                value_name='price')

        fig = px.line(input_df_long, x="date", y="price", color="strategy")

        st.markdown("### Portfolio value")
        st.plotly_chart(fig, use_container_width=True)

        # Portfolio drawdowns
        idx = 5  # find a smarter way later
        columns = input_df[idx].columns
        input_df[idx]['date'] = input_df[idx].index
        input_df_long = pd.melt(input_df[idx],
                                id_vars=['date'],
                                value_vars=columns,
                                var_name='strategy',
                                value_name='drawdown')

        fig = px.line(input_df_long, x="date", y="drawdown", color="strategy")

        st.markdown("### Portfolio drawdown")
        st.plotly_chart(fig, use_container_width=True)

        # Portfolio metrics
        st.markdown("### Portfolio metrics")
        st.dataframe(input_df[2])

        # Portfolio weights
        st.markdown("### Portfolio weights")
        # col1, col2 = st.beta_columns(2)
        #
        # idx = 3
        # columns=input_df[idx].columns
        # input_df[idx]['date'] = input_df[idx].index
        # input_df_long = pd.melt(input_df[idx], id_vars=['date','strategy'], value_vars=columns[0:-1],var_name='asset', value_name='weight')
        #
        # col1.subheader("Target weights")
        #
        # for strat in input_df_long['strategy'].unique():
        #     fig = px.bar(input_df_long[input_df_long['strategy']==strat], x="date", y="weight", color="asset", title=strat + ' weights')
        #     col1.plotly_chart(fig, use_container_width=True)
        idx = 4
        columns = input_df[idx].columns
        input_df[idx]['date'] = input_df[idx].index
        input_df_long = pd.melt(input_df[idx],
                                id_vars=['date', 'strategy'],
                                value_vars=columns[0:-1],
                                var_name='asset',
                                value_name='weight')

        st.subheader("Effective weights")

        for strat in input_df_long['strategy'].unique():
            fig = px.bar(input_df_long[input_df_long['strategy'] == strat],
                         x="date",
                         y="weight",
                         color="asset",
                         title=strat + ' weights')
            st.plotly_chart(fig, use_container_width=True)

        # Asset value
        idx = 6
        columns = input_df[idx].columns
        input_df[idx]['date'] = input_df[idx].index
        input_df_long = pd.melt(input_df[idx],
                                id_vars=['date'],
                                value_vars=columns,
                                var_name='asset',
                                value_name='price')

        fig = px.line(input_df_long, x="date", y="price", color="asset")

        st.markdown("### Assets value")
        st.plotly_chart(fig, use_container_width=True)

        # Assets drawdowns
        idx = 7  # find a smarter way later
        columns = input_df[idx].columns
        input_df[idx]['date'] = input_df[idx].index
        input_df_long = pd.melt(input_df[idx],
                                id_vars=['date'],
                                value_vars=columns,
                                var_name='asset',
                                value_name='drawdown')

        fig = px.line(input_df_long, x="date", y="drawdown", color="asset")

        st.markdown("### Assets drawdown")
        st.plotly_chart(fig, use_container_width=True)

        # # Portfolio Returns
        idx = 1
        # Determine the price frequency
        dates = []
        for i in range(1, len(input_df[idx].index)):
            dates.append(
                datetime.strptime(str(input_df[idx].index[i]), '%Y-%m-%d'))
        datediff = stats.mode(np.diff(dates))[0][0]
        if datediff > timedelta(days=250):
            frequency = "Years"
        elif datediff < timedelta(days=2):
            frequency = "Days"
        else:
            frequency = "Other"  # irregular spacing: leave the raw returns untouched below

        rolling_ret_period = st.slider(
            "rolling returns period (in years)",
            min_value=1,
            max_value=30,
            value=1,
            step=1,
            format='%i',
            key='rolling_ret_period',
            help='period of rolling annual return (in years)')

        if frequency == "Days":  # plot the rolling return (annualized)
            for column in input_df[idx]:
                if params['logreturns']:
                    input_df[idx][column] = (input_df[idx][column]).rolling(
                        window=params['DAYS_IN_YEAR'] *
                        rolling_ret_period).sum() / rolling_ret_period
                else:
                    input_df[idx][column] = (
                        1 + input_df[idx][column]).rolling(
                            window=params['DAYS_IN_YEAR'] *
                            rolling_ret_period).apply(
                                np.prod)**(1 / rolling_ret_period) - 1
        elif frequency == "Years":  # plot the rolling 5 years return
            for column in input_df[idx]:
                if params['logreturns']:
                    input_df[idx][column] = (input_df[idx][column]).rolling(
                        window=rolling_ret_period).mean()
                else:
                    input_df[idx][column] = (
                        1 + input_df[idx][column]).rolling(
                            window=rolling_ret_period).apply(np.prod) - 1

        columns = input_df[idx].columns
        input_df[idx]['date'] = input_df[idx].index
        input_df_long = pd.melt(input_df[idx],
                                id_vars=['date'],
                                value_vars=columns,
                                var_name='strategy',
                                value_name='rolling return')

        fig = px.line(input_df_long,
                      x="date",
                      y="rolling return",
                      color="strategy")

        st.markdown("### Portfolio returns")
        st.plotly_chart(fig, use_container_width=True)

        st.markdown("### Downloads area")

        today_str = datetime.today().strftime('%Y-%m-%d')
        outputfilename = [
            "Fund Prices", "Returns", "Performance Metrics", "Target Weights",
            "Effective Weights", "Portfolio Drawdown", "Asset Prices",
            "Assets drawdown"
        ]

        for i, name in enumerate(outputfilename):
            inputfilepath = name + "_" + today_str + '.csv'
            tmp_download_link = utils.download_link(
                input_df[i], inputfilepath, 'Click here to download ' + name)
            st.markdown(tmp_download_link, unsafe_allow_html=True)

        inputfilepath = params['report_name'] + "_" + today_str + '.html'
        tmp_download_link = utils.download_link(
            input_df[8], inputfilepath,
            'Click here to download the html report')
        st.markdown(tmp_download_link, unsafe_allow_html=True)
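The frequency detection in the snippet above keys off the most common gap between consecutive dates; a minimal standalone sketch of that idea (toy dates, assuming an older scipy where stats.mode accepts object arrays and returns array-valued results):

import numpy as np
from datetime import datetime, timedelta
from scipy import stats

dates = [datetime(2021, 1, 1) + timedelta(days=d) for d in (0, 1, 2, 3, 6, 7)]
datediff = stats.mode(np.diff(dates))[0][0]  # most common spacing between observations
if datediff > timedelta(days=250):
    frequency = "Years"
elif datediff < timedelta(days=2):
    frequency = "Days"
else:
    frequency = "Other"
print(datediff, frequency)  # -> 1 day, 0:00:00 Days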
Example #52
0
# %matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
from sklearn.cluster import KMeans

from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape

kmeans = KMeans(n_clusters = 10, random_state = 0)
clusters = kmeans.fit_predict(digits.data)
kmeans.cluster_centers_.shape

fig, ax = plt.subplots(2, 5, figsize=(8, 3))
centers = kmeans.cluster_centers_.reshape(10, 8, 8)
for axi, center in zip(ax.flat, centers):
    axi.set(xticks=[], yticks=[])
    axi.imshow(center, interpolation='nearest', cmap=plt.cm.binary)

from scipy.stats import mode
labels = np.zeros_like(clusters)
for i in range(10):
    mask = (clusters == i)
    labels[mask] = mode(digits.target[mask])[0]
from sklearn.metrics import accuracy_score
accuracy_score(digits.target, labels)

"""###The above output shows that the accuracy is around 80%."""

def votepredict(tot_predicted):
    tot_predicted = np.transpose(tot_predicted)
    vote_predicted = [mode(w).mode[0] for w in tot_predicted]
    return vote_predicted
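A quick usage sketch of the majority-vote helper just defined, on toy per-classifier predictions (older-scipy convention where ModeResult.mode is an array):

preds_clf1 = [0, 1, 1, 2]
preds_clf2 = [0, 1, 2, 2]
preds_clf3 = [1, 1, 2, 2]
tot_predicted = [preds_clf1, preds_clf2, preds_clf3]  # rows: classifiers, columns: samples
print(votepredict(tot_predicted))  # -> [0, 1, 2, 2]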
Example #54
0
    def _extractFeature(self,
                        baseModality: str = "audio",
                        num_word: int = 1,
                        verbose:int = 0,
                        **kwargs):
        """
        recipe: dictionary, required
            file list to extract feature on each modality
        baseModality: string, optional, default="audio"
            base file length for aligning all the other modalities
        """
        # check arguments
        recipe = kwargs["recipe"]
        isFlattened = kwargs["isFlattened"]
        isOnehot = kwargs["isOnehot"]

        # extract feature from each file
        self.num_files = len(recipe[list(recipe.keys())[0]])
        if verbose > 0:
            fileIdxIterator = tqdm(np.arange(self.num_files), ascii=True, desc="extracting")
        else:
            fileIdxIterator = np.arange(self.num_files)

        features = dict()
        for fileIdx in fileIdxIterator:
            min_length = sys.maxsize
            for modality in recipe.keys():
                features_per_file = self.singleFileExtractor.getXy(
                    fileName=recipe[modality][fileIdx],
                    modality=modality,
                    verbose=verbose)
                if modality in features.keys():
                    features[modality].append(features_per_file)
                else:
                    features[modality] = [features_per_file]
                if modality != "text":
                    min_length = min(min_length, len(features_per_file))

            # align length of each modality
            for modality in recipe.keys():
                features[modality][fileIdx] = features[modality][fileIdx][:min_length]

        if self.sample_shift > 0:
            feature_shape = dict()
            num_total_sample = dict()
            for modality in features.keys():
                for features_per_file in features[modality]:
                    # store the shapes in each modalities
                    if modality not in feature_shape.keys():
                        feature_shape[modality] = features_per_file[0].shape

                    # store the length in each modalities
                    if modality not in num_total_sample.keys():
                        num_total_sample[modality] = int( (len(features_per_file) - self.window_size) / self.sample_shift)
                    else:
                        num_total_sample[modality] += int( (len(features_per_file) - self.window_size) / self.sample_shift)

            print("feature_shape: {0}".format(feature_shape))
            print("num_total_sample: {0}".format(num_total_sample))
            base_num_sample = []
            for modality in features.keys():
                if verbose > 0:
                    print("sampling... modality:{0}".format(modality))

                # create empty array for samples per one modality
                if modality == "text":
                    samples = np.zeros((num_total_sample[baseModality], ) + num_word * feature_shape[modality])
                elif modality == "ref" or modality == "label":
                    samples = np.zeros((num_total_sample[modality], ) + feature_shape[modality])
                else:
                    if isFlattened:
                        samples = np.zeros((num_total_sample[modality], self.window_size * np.prod(feature_shape[modality])))
                    else:
                        samples = np.zeros((num_total_sample[modality], self.window_size) + feature_shape[modality])

                file_shift = 0
                for fileIdx, features_per_file in enumerate(features[modality]):
                    if modality == "text":
                        num_sample = base_num_sample[fileIdx]
                        num_word_per_file = len(features_per_file)
                    else:
                        num_sample = int( (len(features_per_file) - self.window_size) / self.sample_shift)

                    # store number of samples at each file on base modality
                    if modality == baseModality:
                        base_num_sample.append(num_sample)

                    for sampleIdx in range(num_sample):
                        if modality == "text":
                            start = int(sampleIdx / num_sample * num_word_per_file)
                            end = int(sampleIdx / num_sample * num_word_per_file) + 1
                        else:
                            start = sampleIdx * self.sample_shift
                            end = sampleIdx * self.sample_shift + self.window_size

                        if modality == "ref" or modality == "label":
                            mode_val, mode_num = stats.mode(features_per_file[start:end])
                            sample = mode_val
                        else:
                            if isFlattened:
                                sample = np.array(features_per_file[start:end]).flatten()
                            else:
                                sample = np.array(features_per_file[start:end])
                        samples[file_shift + sampleIdx] = sample
                    file_shift += num_sample
                features[modality] = samples
        return features
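The "ref"/"label" branch above collapses each window to its most frequent value with stats.mode; the same step in isolation (illustrative frame labels, older-scipy array-valued return):

import numpy as np
from scipy import stats

window = np.array([2, 2, 2, 1, 2, 3, 2, 2])  # per-frame labels inside one window
mode_val, mode_num = stats.mode(window)      # most frequent label and how often it occurs
print(mode_val[0], mode_num[0])              # -> 2 6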
Example #55
0
    depths = np.arange(1, num_depths + 1)
    for depth in depths:
        y = np.zeros((n, len(test_data)))
        # We are assuming that N(x) = 0, so there's no noise. This means y_star = y_t
        y_star = t = test_labels
        for i in range(n):
            boot_data, boot_labels = bootstrap_replicate(
                train_data, train_labels)
            tree = DecisionTree(boot_data,
                                boot_labels,
                                attributes,
                                p_threshold=p_max,
                                max_level=depth)
            y[i] = tree.classify(test_data)
        # Under zero-one loss the main prediction is the mode (least squares: mean, absolute loss: median)
        y_m = st.mode(y, 0)[0][0]
        # What's the overall test accuracy of our prediction: correct / (correct + incorrect)
        accuracy[depth - 1] = sum(np.asarray(y_m == y_star, int)) / len(y_star)
        # Bias: average zero-one loss between the optimal and main predictions
        bias[depth - 1] = np.mean(zero_one_loss(y_star, y_m))
        # Variance: average {across examples} of [(+1 if main = optimal, -1 otherwise) *
        #           average {across test datasets} zero-one loss between individual predictions and main prediction
        c2 = np.asarray(y_m == y_star,
                        dtype=int) * 2 - 1  # 1 if y_m == y_star, -1 otherwise
        loss_ym_y = np.array([zero_one_loss(y_m, y_i) for y_i in y])
        variance[depth - 1] = np.mean(c2 * np.mean(loss_ym_y, 0))

    # Plot Bias, Variance and overall accuracy
    plt.plot(depths, bias)
    plt.plot(depths, variance)
    plt.plot(depths, accuracy, ls='--')
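Under zero-one loss the main prediction is the column-wise mode of the bootstrap models' outputs, exactly as computed above; a minimal standalone sketch (toy predictions, older-scipy indexing):

import numpy as np
from scipy import stats as st

y = np.array([[0, 1, 1],   # predictions of 3 bootstrap models
              [0, 1, 0],   # for 3 test points (rows: models, columns: points)
              [1, 1, 0]])
y_m = st.mode(y, 0)[0][0]  # main prediction per test point
print(y_m)                 # -> [0 1 0]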
    data = data.iloc[1:]
    # DATA PREPROCESSING
    data_convoluted = []
    labels = []
   
    # Slide a "SEGMENT_TIME_SIZE" wide window with a step size of "TIME_STEP"
    for i in range(0, len(data) - SEGMENT_TIME_SIZE, TIME_STEP):
        eF = data['ElbowFlexion'].values[i: i + SEGMENT_TIME_SIZE]
        eS = data['ElbowSupination'].values[i: i + SEGMENT_TIME_SIZE]
        sF = data['ShoulderFlexion'].values[i: i + SEGMENT_TIME_SIZE]
        sA = data['ShoulderAbduction'].values[i: i + SEGMENT_TIME_SIZE]
        sR = data['ShoulderRotation'].values[i: i + SEGMENT_TIME_SIZE]
        data_convoluted.append([eF, eS, sF, sA, sR])

        # Label for a data window is the label that appears most commonly
        label = stats.mode(data['Label'][i: i + SEGMENT_TIME_SIZE])[0][0]

        labels.append(label)

    print("Convoluted data shape: ", np.array(data_convoluted).shape)
    # Convert to numpy
    data_convoluted = np.asarray(data_convoluted, dtype=np.float32).transpose(0, 2, 1)
    # One-hot encoding
    labels = np.asarray(pd.get_dummies(labels), dtype=np.float32)
    #print("Convoluted data shape: ", data_convoluted.shape)
    #print("Labels shape:", labels.shape)

    # SPLIT INTO TRAINING AND TEST SETS
    X_train, X_test, y_train, y_test = train_test_split(data_convoluted, labels, test_size=0.3, random_state=RANDOM_SEED)
    #print("X train size: ", len(X_train))
    print("X test size: ", len(X_test))
"@Author: @learn.machinelearning"

import numpy as np
from scipy import stats

# Python code for Mean, Median, Mode
# Theory https://www.instagram.com/p/BtPtUJRHd16/

vector_A = np.array([1, 1, 2, 3, 4, 6, 18])
#mean value
mean = np.mean(vector_A)

#median value
median = np.median(vector_A)

#mode value
mode = stats.mode(vector_A)
print("Mean: ", mean)
print("Median: ", median)
print("Mode: ", mode[1][0][0])
Example #58
0
"""
Created on Mon Feb 26 15:57:49 2018

@author: NI389899
"""

import pandas as pd
import numpy as np
from scipy import stats

#Reading excel file using pandas
dataset = pd.read_excel("Stats.xlsx")

exp_years = dataset.loc[:, "YearsOfExp"].values
salary = dataset.loc[:, "Salary in Rs."].values

#Getting mean,mode,median using numpy in-built functions
mean_exp = np.mean(exp_years)
mean_sal = np.mean(salary)
mode_exp = stats.mode(exp_years)
mode_sal = stats.mode(salary)
median_exp = np.median(exp_years)
median_sal = np.median(salary)

#Printing obtained values
print("Mean Years Of Experience: ", mean_exp)
print("Mean Salary: ", mean_sal)
print("Mode Years Of Experience: ", mode_exp[0][0])
print("Mode Salary: ", mode_sal[0][0])
print("Median Years Of Experience: ", median_exp)
print("Median Salary: ", median_sal)
Example #59
0
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from scipy.stats import mode
import pandas as pd
import numpy as np

df_train = pd.read_csv('/Users/tavleenkaur/Documents/fractal/train.csv')
test_df = pd.read_csv('/Users/tavleenkaur/Documents/fractal/test.csv')

all_items = df_train.Item_ID.unique()
frames = []  # imputed per-item frames, concatenated below


for item in all_items:
	item_df = pd.DataFrame(df_train.loc[df_train['Item_ID'] == item, ['Datetime', 'Item_ID', 'ID',
							'Category_1', 'Category_2', 'Category_3', 'Price', 'Number_Of_Sales']])

	item_df['Category_2'].fillna(mode(item_df['Category_2'].dropna()).mode[0], inplace=True)
	item_df['Category_3'].fillna(item_df['Category_3'].mode()[0], inplace=True)
	item_df['Category_1'].fillna(item_df['Category_1'].mean(), inplace=True)
	item_df['Price'].fillna(item_df['Price'].mean(), inplace=True)
	frames.append(item_df)

	# ----- For plotting ------
	# temp = {}	
	# for b in bins:
	# 	temp[b]= item_df.loc[item_df['bin'] == b, 'Number_Of_Sales'].sum()
	# plt.plot(list(temp.keys()), list(temp.values()))

final_df = pd.concat(frames)

# Initialise regression model
regr = linear_model.LinearRegression()
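A tiny sketch of the mode-based imputation used in the loop above (toy frame; note that scipy's mode and pandas' Series.mode are indexed differently, older-scipy array-valued convention):

import numpy as np
import pandas as pd
from scipy.stats import mode

df = pd.DataFrame({'Category_2': [1.0, 1.0, np.nan, 3.0],
                   'Category_3': ['a', np.nan, 'a', 'b']})
df['Category_2'] = df['Category_2'].fillna(mode(df['Category_2'].dropna()).mode[0])  # scipy ModeResult
df['Category_3'] = df['Category_3'].fillna(df['Category_3'].mode()[0])               # pandas Series.mode
print(df)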
Example #60
0
target_x = tf.placeholder("float", [1,784]) #target vector
X = tf.placeholder("float", [None, 784]) #matrix of observations to compare to target
y = tf.placeholder("float", [None, 10]) #matrix of one-hot class vectors 

l1_dist = tf.reduce_sum(tf.abs(tf.sub(X, target_x)), 1)  #manhattan (L1) distance: the sum of absolute differences between elements, row-wise.

l2_dist = tf.reduce_sum(tf.square(tf.sub(X, target_x)), 1)  #squared euclidean distance: the sum of squared differences between elements, row-wise.

#nn = tf.argmin(l1_dist, 0)
nn = tf.nn.top_k(-l1_dist, k)  # k (number of neighbours) is assumed to be defined earlier in the original script

init = tf.initialize_all_variables()
accuracy_history = []
with tf.Session() as sess:
	sess.run(init)
	for obs in range(X_test.shape[0]):
		nn_index = sess.run(nn, feed_dict = {X: X_train, y: y_train, target_x: np.asmatrix(X_test[obs])})
		pred_classes = []
		for i in range(k):
			nn_class = np.argmax(y_train[nn_index[1][i]])
			#print nn_class
			pred_classes.append(nn_class)
		predicted_class = stats.mode(pred_classes)[0][0]
		true_class = np.argmax(y_test[obs])
		print "True class: " + str(true_class) + ", predicted class: " + str(predicted_class)
		if predicted_class == true_class:
			accuracy_history.append(1)
		else:
			accuracy_history.append(0)

print "model was " + str(np.mean(accuracy_history)) + "% accurate"