def resample(self):
    """Under-sample by removing samples whose neighbourhood disagrees
    with their own label (edited nearest neighbours rule)."""
    # Start with the minority class
    underx = self.x[self.y == self.minc]
    undery = self.y[self.y == self.minc]

    # Create a k-NN estimator and fit it on the whole data set
    from sklearn.neighbors import NearestNeighbors
    nn_obj = NearestNeighbors(n_neighbors=self.size_ngh)
    nn_obj.fit(self.x)

    idx_to_exclude = []
    # Loop over all the classes
    for key in self.ucd.keys():
        # Get the samples of the current class and their indexes
        sub_samples_x = self.x[self.y == key]
        idx_sub_sample = np.nonzero(self.y == key)[0]

        # Find the NN for the current class
        nnhood_idx = nn_obj.kneighbors(sub_samples_x, return_distance=False)

        # Check which neighbours carry the same label as the current class
        nnhood_label = (self.y[nnhood_idx] == key)

        # A sample is editable when not all of its neighbours agree with
        # its label (AND operation across the neighbours)
        nnhood_bool = np.logical_not(np.all(nnhood_label, axis=1))

        if key == self.minc:
            # For the minority class, exclude the offending neighbours
            idx_to_exclude += nnhood_idx[np.nonzero(
                nnhood_label[np.nonzero(nnhood_bool)])].tolist()
        else:
            # For the other classes, exclude the disagreeing samples
            idx_to_exclude += idx_sub_sample[np.nonzero(nnhood_bool)].tolist()

    # Build a selection mask over all the samples
    sel_idx = np.ones(self.y.shape)
    sel_idx[idx_to_exclude] = 0

    # Get the selected samples from the majority classes
    sel_x = np.squeeze(self.x[np.nonzero(sel_idx), :])
    sel_y = self.y[np.nonzero(sel_idx)]
    underx = concatenate((underx, sel_x), axis=0)
    undery = concatenate((undery, sel_y), axis=0)

    if self.verbose:
        print("Under-sampling performed: " + str(Counter(undery)))

    return underx, undery
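# A minimal, self-contained sketch of the edited-nearest-neighbours rule the
# method above implements. The toy data, class sizes and neighbour count are
# illustrative assumptions, not values taken from the class.
import numpy as np
from collections import Counter
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
x = np.vstack([rng.normal(0, 1, (40, 2)), rng.normal(1.5, 1, (10, 2))])
y = np.array([0] * 40 + [1] * 10)

nn = NearestNeighbors(n_neighbors=4).fit(x)
# Column 0 is the point itself, so keep only its 3 true neighbours
idx = nn.kneighbors(x, return_distance=False)[:, 1:]
agree = np.all(y[idx] == y[:, None], axis=1)

# Edit out majority samples whose neighbourhood disagrees with their label
keep = agree | (y == 1)
print(Counter(y[keep]))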
def resample(self):
    """Under-sample with condensed nearest neighbours: build a set C from
    the minority class plus a few majority seeds, then keep the majority
    samples that a k-NN classifier trained on C misclassifies."""
    # Start with the minority class
    underx = self.x[self.y == self.minc]
    undery = self.y[self.y == self.minc]

    # Import the k-NN classifier
    from sklearn.neighbors import KNeighborsClassifier

    # Loop over the other classes
    for key in self.ucd.keys():
        # Skip the minority class
        if key == self.minc:
            continue

        # Randomly draw the seed samples from the current majority class
        maj_sample = sample(self.x[self.y == key], self.n_seeds_S)

        # Create the set C: minority samples plus the majority seeds
        C_x = np.append(self.x[self.y == self.minc], maj_sample, axis=0)
        C_y = np.append(self.y[self.y == self.minc], [key] * self.n_seeds_S)

        # Create the set S: all samples of the current class
        S_x = self.x[self.y == key]
        S_y = self.y[self.y == key]

        # Fit a k-NN classifier on C and classify S
        knn = KNeighborsClassifier(n_neighbors=self.size_ngh, **self.kwargs)
        knn.fit(C_x, C_y)
        pred_S_y = knn.predict(S_x)

        # Keep the misclassified samples of S
        sel_x = np.squeeze(S_x[np.nonzero(pred_S_y != S_y), :])
        sel_y = S_y[np.nonzero(pred_S_y != S_y)]
        underx = concatenate((underx, sel_x), axis=0)
        undery = concatenate((undery, sel_y), axis=0)

    if self.verbose:
        print("Under-sampling performed: " + str(Counter(undery)))

    return underx, undery
def evaluate_performance(self):
    import numpy as np

    # Make a prediction
    X = self.test_X  # np.expand_dims(self.test_X, axis=-1)
    yhat = self.model_type.value.predict(X)
    test_X = self.test_X.reshape((self.test_X.shape[0], self.test_X.shape[2]))

    # Invert scaling for the forecast: pad the 1-column forecast back to
    # the full feature width the fitted transformer expects
    inv_yhat = np.concatenate((yhat, test_X[:, 1:]), axis=1)
    inv_yhat = self.transformer.inverse_transform(inv_yhat)
    inv_yhat = inv_yhat[:, 0]

    # Invert scaling for the actual values
    test_y = self.test_y.reshape((len(self.test_y), 1))
    inv_y = np.concatenate((test_y, test_X[:, 1:]), axis=1)
    inv_y = self.transformer.inverse_transform(inv_y)
    inv_y = inv_y[:, 0]

    # Calculate RMSE
    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
    logging.debug('Test RMSE: %.3f' % rmse)
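# Why the padding above is needed: a scaler fitted on n features can only
# invert arrays with n columns, so a single-column forecast must be padded
# back to the full width first. A minimal sketch with made-up data:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

rng = np.random.RandomState(0)
data = rng.rand(100, 3)
scaler = MinMaxScaler().fit(data)
scaled = scaler.transform(data)

yhat = scaled[:, :1]                      # stand-in for a model forecast
padded = np.concatenate((yhat, scaled[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(padded)[:, 0]
print(np.allclose(inv_yhat, data[:, 0]))  # True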
def resample(self):
    """Under-sample by replacing each majority class with the centroids
    of a KMeans clustering of that class.

    :return: Under-sampled data set.
    """
    # Create the clustering object
    from sklearn.cluster import KMeans
    kmeans = KMeans(random_state=self.rs)
    kmeans.set_params(**self.kwargs)

    # Start with the minority class
    underx = self.x[self.y == self.minc]
    undery = self.y[self.y == self.minc]

    # Loop over the other classes
    for key in self.ucd.keys():
        # Skip the minority class
        if key == self.minc:
            continue

        # The number of clusters cannot exceed the number of samples
        # available in the current class
        if self.ratio * self.ucd[self.minc] > self.ucd[key]:
            n_clusters = self.ucd[key]
        else:
            n_clusters = int(self.ratio * self.ucd[self.minc])

        # Set the number of clusters and find the centroids
        kmeans.set_params(n_clusters=n_clusters)
        kmeans.fit(self.x[self.y == key])
        centroids = kmeans.cluster_centers_

        # Concatenate to the minority class
        underx = concatenate((underx, centroids), axis=0)
        undery = concatenate((undery, ones(n_clusters) * key), axis=0)

    if self.verbose:
        print("Under-sampling performed: " + str(Counter(undery)))

    return underx, undery
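# A standalone sketch of the centroid idea above: the majority class is
# replaced by the centroids of k clusters, k being its target size after
# under-sampling. Toy data and a ratio of 1.0 are assumptions.
import numpy as np
from collections import Counter
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
maj_x = rng.normal(0, 1, (100, 2))
min_x = rng.normal(3, 1, (10, 2))

n_clusters = len(min_x)  # ratio = 1.0 relative to the minority class
km = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(maj_x)

under_x = np.concatenate((min_x, km.cluster_centers_), axis=0)
under_y = np.concatenate((np.ones(len(min_x)), np.zeros(n_clusters)))
print(Counter(under_y))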
def resample(self):
    """Under-sample the majority classes by picking samples at random,
    with or without replacement."""
    # Start with the minority class
    underx = self.x[self.y == self.minc]
    undery = self.y[self.y == self.minc]

    # Loop over the other classes
    for key in self.ucd.keys():
        # Skip the minority class
        if key == self.minc:
            continue

        # The number of samples to pick cannot exceed what is available
        if self.ratio * self.ucd[self.minc] > self.ucd[key]:
            num_samples = self.ucd[key]
        else:
            num_samples = int(self.ratio * self.ucd[self.minc])

        # Pick some elements at random
        seed(self.rs)
        if self.replacement:
            indx = randint(low=0, high=self.ucd[key], size=num_samples)
        else:
            indx = sample(range((self.y == key).sum()), num_samples)

        # Concatenate to the minority class
        underx = concatenate((underx, self.x[self.y == key][indx]), axis=0)
        undery = concatenate((undery, self.y[self.y == key][indx]), axis=0)

    if self.verbose:
        print("Under-sampling performed: " + str(Counter(undery)))

    return underx, undery
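# A minimal sketch of the with/without-replacement draw above, using only
# numpy on made-up data (class sizes and the seed are illustrative).
import numpy as np
from collections import Counter

rng = np.random.RandomState(42)
y = np.array([0] * 90 + [1] * 10)   # 0 = majority, 1 = minority
num_samples = 10                    # ratio = 1.0 relative to the minority

with_replacement = rng.randint(0, (y == 0).sum(), size=num_samples)
without_replacement = rng.choice((y == 0).sum(), size=num_samples,
                                 replace=False)

under_y = np.concatenate((y[y == 1], y[y == 0][without_replacement]))
print(Counter(under_y))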
def calc_returns(split_data):
    '''
    Calculate annual returns for periods optimized over slices (of size
    HINDSIGHT) of past data. Gives an idea of what kind of results to
    realistically expect.
    '''
    annual_returns = []
    max_return = float('-inf')
    min_return = float('inf')
    # Start once a full HINDSIGHT window of past data is available
    for i in range(HINDSIGHT, len(split_data)):
        test_year = split_data[i]
        optimize_period = pd.concat(split_data[i - HINDSIGHT:i])
        print('optimize period:')
        print(optimize_period)
        periods = optimize(optimize_period)
        print('periods:')
        print(periods)
        profit = run_analysis(periods, test_year)
        annual_returns.append(profit)
        if profit > max_return:
            max_return = profit
        if profit < min_return:
            min_return = profit
    return annual_returns, max_return, min_return
def gen_feature_dict(self):
    if not self.train_file or not self.test_file:
        raise Exception("provide files for the train and test sets")
    if not self.numeric_cols:
        raise Exception("specify which columns are numeric")
    self.df_train = pd.read_csv(self.train_file)
    self.df_test = pd.read_csv(self.test_file)
    # Fit the encoders on train and test together so every category is seen
    df = pd.concat([self.df_train, self.df_test])
    for col in df.columns:
        if col in self.ignore_cols:
            continue
        if col in self.numeric_cols:
            self.feature_to_type[col] = 'numeric'
        else:
            le = LabelEncoder()
            le.fit(df[col])
            self.feature_to_encoder[col] = le
            self.feature_to_type[col] = 'cat'
        self.columns.append(col)
    return self.df_train, self.df_test
def main():
    parser = argparse.ArgumentParser(
        description="Reads benchmark_results filenames from rados bench and"
                    " plots the results")
    parser.add_argument("--paths", nargs="+", required=False,
                        default=["bench_results.txt"],
                        help="The path of one or more result files")
    args = parser.parse_args()
    paths = args.paths
    if len(paths) <= 1:
        plt = plot_bench_results(paths[0])
    else:
        dfs = []
        for path in paths:
            print("path", path)
            df = bench_results_to_df(path)
            df = add_rolling_results(df)
            df["filename"] = path
            dfs.append(df)
        # Stack the per-file frames row-wise so the column names stay unique
        master_df = pd.concat(dfs, axis=0, ignore_index=True)
        master_df.plot(x="sec", y="MA_30s_ops")
def resample(self):
    """
    Main method of all children classes.

    :return: Over-sampled data set.
    """
    # Start by separating minority class features and target values.
    minx = self.x[self.y == self.minc]
    miny = self.y[self.y == self.minc]

    # If regular SMOTE is to be performed
    if self.kind == 'regular':
        if self.verbose:
            print("Finding the %i nearest neighbours..." % self.k, end="")

        # Look for the k nearest neighbours, excluding, of course,
        # the point itself.
        self.nearest_neighbour_.fit(minx)

        # Matrix with the nearest-neighbour indexes of each minority
        # element (column 0 is the point itself, hence the slice).
        nns = self.nearest_neighbour_.kneighbors(
            minx, return_distance=False)[:, 1:]

        if self.verbose:
            print("done!")
            print("Creating synthetic samples...", end="")

        # --- Generating synthetic samples
        # Use the static method make_samples to generate minority samples
        sx, sy = self.make_samples(x=minx, nn_data=minx, y_type=self.minc,
                                   nn_num=nns,
                                   n_samples=int(self.ratio * len(miny)),
                                   step_size=1.0, random_state=self.rs,
                                   verbose=self.verbose)

        if self.verbose:
            print("done!")

        # Concatenate the newly generated samples to the original data set
        ret_x = np.concatenate((self.x, sx), axis=0)
        ret_y = np.concatenate((self.y, sy), axis=0)

        return ret_x, ret_y

    if (self.kind == 'borderline1') or (self.kind == 'borderline2'):
        if self.verbose:
            print("Finding the %i nearest neighbours..." % self.m, end="")

        # Find the NNs for all samples in the data set.
        self.nearest_neighbour_.fit(self.x)

        if self.verbose:
            print("done!")

        # Boolean array with True for minority samples in danger
        danger_index = [self.in_danger(x, self.y, self.m, miny[0],
                                       self.nearest_neighbour_)
                        for x in minx]
        danger_index = np.asarray(danger_index)

        # If all minority samples are safe, return the original data set.
        if not any(danger_index):
            if self.verbose:
                print('There are no samples in danger. No borderline '
                      'synthetic samples created.')
            # All are safe, nothing to be done here.
            return self.x, self.y

        # Some samples are in danger; find the NNs among the minority
        # class to create the new synthetic samples. Change the number
        # of NNs to consider from m + 1 to k + 1.
        self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
        self.nearest_neighbour_.fit(minx)

        nns = self.nearest_neighbour_.kneighbors(
            minx[danger_index], return_distance=False)[:, 1:]

        # B1 and B2 types diverge here
        if self.kind == 'borderline1':
            # Create synthetic samples for borderline points.
            sx, sy = self.make_samples(minx[danger_index], minx, miny[0],
                                       nns, int(self.ratio * len(miny)),
                                       random_state=self.rs,
                                       verbose=self.verbose)

            # Concatenate the new samples to the original data set
            ret_x = np.concatenate((self.x, sx), axis=0)
            ret_y = np.concatenate((self.y, sy), axis=0)

            return ret_x, ret_y

        else:
            # Split the number of synthetic samples between only minority
            # (type 1), or minority and majority (with reduced step size)
            # (type 2).
            np.random.seed(self.rs)

            # Sample the fraction from a beta distribution centred around
            # 0.5 with variance ~0.01, using numpy's RNG so the seed set
            # above actually applies.
            fractions = np.random.beta(10, 10)

            # Only minority
            sx1, sy1 = self.make_samples(
                minx[danger_index], minx, self.minc, nns,
                int(fractions * (int(self.ratio * len(miny)) + 1)),
                step_size=1, random_state=self.rs, verbose=self.verbose)

            # Only majority with smaller step size
            sx2, sy2 = self.make_samples(
                minx[danger_index], self.x[self.y != self.minc],
                self.minc, nns,
                int((1 - fractions) * int(self.ratio * len(miny))),
                step_size=0.5, random_state=self.rs, verbose=self.verbose)

            # Concatenate the new samples to the original data set
            ret_x = np.concatenate((self.x, sx1, sx2), axis=0)
            ret_y = np.concatenate((self.y, sy1, sy2), axis=0)

            return ret_x, ret_y

    if self.kind == 'svm':
        # The SVM SMOTE model fits a support vector machine classifier to
        # the data and uses the support vectors to provide a notion of
        # boundary, unlike regular SMOTE, where that notion relies on the
        # proportion of nearest neighbours belonging to each class.

        # Fit the SVM to the full data
        self.svm_.fit(self.x, self.y)

        # Find the support vectors and their corresponding indexes
        support_index = self.svm_.support_[
            self.y[self.svm_.support_] == self.minc]
        support_vector = self.x[support_index]

        # First, find the NNs of all the samples to identify samples in
        # danger and noisy ones
        if self.verbose:
            print("Finding the %i nearest neighbours..." % self.m, end="")

        # As usual, fit a nearest neighbour model to the data
        self.nearest_neighbour_.fit(self.x)

        if self.verbose:
            print("done!")

        # Now, get rid of noisy support vectors:
        # boolean array with True for noisy support vectors
        noise_bool = []
        for x in support_vector:
            noise_bool.append(self.is_noise(x, self.y, self.minc,
                                            self.nearest_neighbour_))
        noise_bool = np.asarray(noise_bool)

        # Remove noisy support vectors
        support_vector = support_vector[np.logical_not(noise_bool)]

        # Find the support vectors that are in danger (interpolation)
        # or not (extrapolation)
        danger_bool = [self.in_danger(x, self.y, self.m, self.minc,
                                      self.nearest_neighbour_)
                       for x in support_vector]
        danger_bool = np.asarray(danger_bool)
        safety_bool = np.logical_not(danger_bool)

        if self.verbose:
            print("Out of {0} support vectors, {1} are noisy, "
                  "{2} are in danger "
                  "and {3} are safe.".format(support_vector.shape[0],
                                             noise_bool.sum().astype(int),
                                             danger_bool.sum().astype(int),
                                             safety_bool.sum().astype(int)))

            # Proceed to find the support vectors' NNs among the
            # minority class
            print("Finding the %i nearest neighbours..." % self.k, end="")

        self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
        self.nearest_neighbour_.fit(minx)

        if self.verbose:
            print("done!")
            print("Creating synthetic samples...", end="")

        # Split the number of synthetic samples between interpolation and
        # extrapolation. The fraction is sampled from a beta distribution
        # with mean 0.5 and variance ~0.01, using numpy's RNG so the seed
        # actually applies.
        np.random.seed(self.rs)
        fractions = np.random.beta(10, 10)

        # Interpolate samples in danger
        if np.count_nonzero(danger_bool) > 0:
            nns = self.nearest_neighbour_.kneighbors(
                support_vector[danger_bool], return_distance=False)[:, 1:]

            sx1, sy1 = self.make_samples(
                support_vector[danger_bool], minx, self.minc, nns,
                int(fractions * (int(self.ratio * len(minx)) + 1)),
                step_size=1, random_state=self.rs, verbose=self.verbose)

        # Extrapolate safe samples
        if np.count_nonzero(safety_bool) > 0:
            nns = self.nearest_neighbour_.kneighbors(
                support_vector[safety_bool], return_distance=False)[:, 1:]

            sx2, sy2 = self.make_samples(
                support_vector[safety_bool], minx, self.minc, nns,
                int((1 - fractions) * int(self.ratio * len(minx))),
                step_size=-self.out_step, random_state=self.rs,
                verbose=self.verbose)

        if self.verbose:
            print("done!")

        # Concatenate the newly generated samples to the original data set
        if (np.count_nonzero(danger_bool) > 0 and
                np.count_nonzero(safety_bool) > 0):
            ret_x = np.concatenate((self.x, sx1, sx2), axis=0)
            ret_y = np.concatenate((self.y, sy1, sy2), axis=0)
        # No support vectors in danger
        elif np.count_nonzero(danger_bool) == 0:
            ret_x = np.concatenate((self.x, sx2), axis=0)
            ret_y = np.concatenate((self.y, sy2), axis=0)
        # All the support vectors in danger
        elif np.count_nonzero(safety_bool) == 0:
            ret_x = np.concatenate((self.x, sx1), axis=0)
            ret_y = np.concatenate((self.y, sy1), axis=0)

        return ret_x, ret_y
def resample(self):
    """
    Over-samples the minority classes by randomly picking samples with
    replacement, or by gaussian perturbation of existing samples.

    :return: overx, overy: The features and target values of the
        over-sampled data set.
    """
    # Start with the majority class
    overx = self.x[self.y == self.maxc]
    overy = self.y[self.y == self.maxc]

    # Loop over the other classes
    for key in self.ucd.keys():
        if key == self.maxc:
            continue

        # If the ratio given is too large such that the minority becomes
        # a majority, clip it.
        if self.ratio * self.ucd[key] > self.ucd[self.maxc]:
            num_samples = self.ucd[self.maxc] - self.ucd[key]
        else:
            num_samples = int(self.ratio * self.ucd[key])

        if self.method == 'replacement':
            # Pick some elements at random, with replacement
            seed(self.rs)
            indx = randint(low=0, high=self.ucd[key], size=num_samples)

            # Concatenate to the majority class
            overx = concatenate((overx,
                                 self.x[self.y == key],
                                 self.x[self.y == key][indx]), axis=0)
            overy = concatenate((overy,
                                 self.y[self.y == key],
                                 self.y[self.y == key][indx]), axis=0)

        elif self.method == 'gaussian-perturbation':
            # Pick the indexes of the samples to perturb
            seed(self.rs)
            indx = randint(low=0, high=self.ucd[key], size=num_samples)

            # Generate the new samples: original sample plus gaussian noise
            sam_pert = []
            for i in indx:
                pert = np.random.normal(self.mean_gaussian,
                                        self.std_gaussian,
                                        self.x[self.y == key][i].shape)
                sam_pert.append(self.x[self.y == key][i] + pert)

            # Convert the list to a numpy array
            sam_pert = np.array(sam_pert)

            # Concatenate to the majority class
            overx = concatenate((overx,
                                 self.x[self.y == key],
                                 sam_pert), axis=0)
            overy = concatenate((overy,
                                 self.y[self.y == key],
                                 self.y[self.y == key][indx]), axis=0)

    if self.verbose:
        print("Over-sampling performed: " + str(Counter(overy)))

    # Return the over-sampled data set
    return overx, overy
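# A minimal sketch of the gaussian-perturbation branch above: over-sample a
# class by adding zero-mean gaussian noise to randomly chosen existing
# samples. The data, noise parameters and sample counts are illustrative.
import numpy as np

rng = np.random.RandomState(0)
min_x = rng.rand(10, 3)          # samples of the class to over-sample
num_samples = 5
mean_g, std_g = 0.0, 0.1

idx = rng.randint(0, len(min_x), size=num_samples)
noise = rng.normal(mean_g, std_g, size=(num_samples, min_x.shape[1]))
new_x = min_x[idx] + noise

over_x = np.concatenate((min_x, new_x), axis=0)
print(over_x.shape)              # (15, 3)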
Merging
pandas.merge(frame1, frame2, on='id')  # the on parameter names the column to merge on
To merge on several keys, pass a list: on=['id', 'brand']
When the column names differ:
pandas.merge(frame1, frame2, left_on='id', right_on='sid')
----- The how parameter sets the join type: outer, left, right
--- Merge on the index by setting right_index and left_index to True:
pd.merge(fr1, fr2, right_index=True, left_index=True)
----- A frame's join() method is better suited to index merges:
fr1.join(fr2)  # joins by index; the column names must differ ==== key point
---- Concatenation: the concatenate() function works on ndarray objects:
np.concatenate([array1, array2], axis=1)  # column-wise concatenation
---- Axis-wise concatenation of Series and DataFrame objects:
pd.concat([ser1, ser2])  # axis=0 by default; outer join by default, missing entries become NaN
The join parameter changes the join type:
pd.concat([ser1, ser2], axis=1, join='inner')  # inner join
The keys parameter creates a hierarchical index on the concatenation axis:
pd.concat([ser1, ser2], axis=1, keys=[1, 2])  # labels the ser1 and ser2 data

6.2.1 Combining
Merging and concatenation cannot combine data whose indexes overlap
completely or partially. combine_first() combines Series objects while
aligning their data:
ser1.combine_first(ser2)  # aligned on ser1
Partial combination:
ser1[1:3].combine_first(ser2[:3])
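A short, runnable recap of the operations above (frame contents are made up
for illustration):

import pandas as pd

frame1 = pd.DataFrame({"id": [1, 2, 3], "price": [10, 20, 30]})
frame2 = pd.DataFrame({"id": [2, 3, 4], "color": ["red", "blue", "green"]})

# Key-based merge (how= controls outer/left/right joins)
print(pd.merge(frame1, frame2, on="id", how="outer"))

# Index-based join; the non-index column names must not overlap
print(frame1.set_index("id").join(frame2.set_index("id")))

# Axis-wise concatenation; join='inner' keeps only shared row labels
print(pd.concat([frame1, frame2], axis=1, join="inner"))

# combine_first(): patch missing values in ser1 with values from ser2
ser1 = pd.Series([1.0, None, 3.0], index=["a", "b", "c"])
ser2 = pd.Series([10.0, 20.0, 30.0], index=["a", "b", "c"])
print(ser1.combine_first(ser2))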
    idx = df[
        (df["Subject"] == "GreBla5671F")
        & (df["Date"] >= datetime.date(2020, 1, 7))
        & (df["Date"] <= datetime.date(2020, 1, 10))
    ].index
    df.loc[idx, "Condition"] = "MonthLater"
    return df


if __name__ == "__main__":
    from configs.active_config import config
    from analysis.download_scripts.project_lesions_2021 import download

    try:
        download()
    except Exception:
        pass

    subject_dfs = []
    for subject in config.subjects:
        # Preprocessing steps
        df = run_pipeline_subject(subject, config)
        subject_dfs.append(df)

    full_df = pd.concat(subject_dfs).reset_index(drop=True)
    full_df.to_csv(
        os.path.join(config.metadata_dir, "TrialsData.csv"),
        index=False
    )
detect_all = pd.DataFrame(
    columns=['file', 'xmin', 'ymin', 'xmax', 'ymax', 'conf', 'class'])

detz = 0
for ff in range(len(valid_files)):
    # for ff in [213, 686, 856, 867, 956, 967]:
    if ff % 1000 == 0:
        print(ff)
    image_name = valid_files[ff]
    prefix = image_name[:-4]
    if image_name[-4:] == ".png":
        image_in = cv2.imread(test_img_in + image_name)
        # image_in = cv2.imread(valid_image_folder + image_name)
        dummy_array = np.zeros((1, 1, 1, 1, n_anchors, 4))
        image_in = image_in / 255.
        image_in = image_in[:, :, ::-1]  # BGR -> RGB
        image_in = np.expand_dims(image_in, 0)
        netout = sess.run(y_pred, feed_dict={img_out: image_in})
        netout = np.reshape(netout, [1, boxy, boxx, n_anchors, out_len])
        boxes_pred = convert2box(netout)
        if len(boxes_pred) > 0:
            print(boxes_pred)
            boxes_pred["file"] = np.repeat(image_name, boxes_pred.shape[0])
            detect_all = pd.concat([detect_all, boxes_pred],
                                   axis=0, ignore_index=True)
            detz = detz + boxes_pred.shape[0]

print("Detections:", detz)
detect_all.to_csv("E:/CF_Calcs/BenchmarkSets/GFRC/core_test/detz.csv",
                  index=False)
""" create dummy dataframe about dragon ball z characters earth location and other information """ name_data_one = {"name": ["goku", "gohan"], "power": [200, 400], city": ["NY", "SEA"]} name_data_two = {"name": ["srijan", "chuck"], "power": [400, 500], city": ["DEN", "SFO"]} dragon_ball_data_one = pd.DataFrame(data=name_data_one) dragon_ball__data_two = pd.DataFrame(data=name_data_two) """ Concatenate two dataframes """ pd.concatenate([name_data_one, name_data_two], axis=0) #concatenate along rows - stack vertically pd.concatenate([name_data_one, name_data_two], axis=1) #concatenate along column - stack horizontally """ Join/Merge two dataframes """ pd.merge(name_data_one, name_data_two, on = "name", how="inner") """ Loop over dataframes
def resample(self):
    """Under-sample with one-sided selection: condense the majority
    classes first, then remove the majority samples that take part in
    Tomek links."""
    # Start with the minority class
    underx = self.x[self.y == self.minc]
    undery = self.y[self.y == self.minc]

    # Import the k-NN classifier
    from sklearn.neighbors import KNeighborsClassifier

    # Loop over the other classes
    for key in self.ucd.keys():
        # Skip the minority class
        if key == self.minc:
            continue

        # Randomly draw the seed samples from the current majority class
        maj_sample = sample(self.x[self.y == key], self.n_seeds_S)

        # Create the set C: minority samples plus the majority seeds
        C_x = np.append(self.x[self.y == self.minc], maj_sample, axis=0)
        C_y = np.append(self.y[self.y == self.minc], [key] * self.n_seeds_S)

        # Create the set S: all samples of the current class
        S_x = self.x[self.y == key]
        S_y = self.y[self.y == key]

        # Fit a k-NN classifier on C and classify S
        knn = KNeighborsClassifier(n_neighbors=self.size_ngh, **self.kwargs)
        knn.fit(C_x, C_y)
        pred_S_y = knn.predict(S_x)

        # Keep the misclassified samples of S
        sel_x = np.squeeze(S_x[np.nonzero(pred_S_y != S_y), :])
        sel_y = S_y[np.nonzero(pred_S_y != S_y)]
        underx = concatenate((underx, sel_x), axis=0)
        undery = concatenate((undery, sel_y), axis=0)

    # Find the nearest neighbour of every point
    from sklearn.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=2)
    nn.fit(underx)
    nns = nn.kneighbors(underx, return_distance=False)[:, 1]

    # Send the information to the is_tomek function to get a boolean
    # vector back
    if self.verbose:
        print("Looking for majority Tomek links...")
    links = self.is_tomek(undery, nns, self.minc, self.verbose)

    if self.verbose:
        print("Under-sampling performed: "
              + str(Counter(undery[logical_not(links)])))

    # Return the data set without majority Tomek links.
    return underx[logical_not(links)], undery[logical_not(links)]
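# A minimal sketch of the Tomek-link test used above: two samples form a
# Tomek link when they are mutual nearest neighbours with different labels.
# (The method above removes only the majority side via is_tomek; here both
# ends are just flagged. Data and labels are made up.)
import numpy as np
from sklearn.neighbors import NearestNeighbors

x = np.array([[0.0], [0.1], [1.0], [1.1]])
y = np.array([0, 1, 0, 0])

nn = NearestNeighbors(n_neighbors=2).fit(x)
nns = nn.kneighbors(x, return_distance=False)[:, 1]

links = np.zeros(len(y), dtype=bool)
for i, j in enumerate(nns):
    if y[i] != y[j] and nns[j] == i:
        links[i] = True
print(links)  # True for both ends of the (0.0, 0.1) pair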
def resample(self):
    """Under-sample the majority classes by selecting samples based on
    their distance to the minority class (NearMiss versions 1, 2 and 3,
    controlled by self.version)."""
    # Start with the minority class
    underx = self.x[self.y == self.minc]
    undery = self.y[self.y == self.minc]

    # For each element of the current class, find the set of NN
    # of the minority class
    from sklearn.neighbors import NearestNeighbors

    # Call the constructor of the NN
    nn_obj = NearestNeighbors(n_neighbors=self.size_ngh, **self.kwargs)

    # Fit the minority class, since we want to know the distances
    # to those points
    nn_obj.fit(self.x[self.y == self.minc])

    # Loop over the other classes
    for key in self.ucd.keys():
        # Skip the minority class
        if key == self.minc:
            continue

        # The number of samples to pick cannot exceed what is available
        if self.ratio * self.ucd[self.minc] > self.ucd[key]:
            num_samples = self.ucd[key]
        else:
            num_samples = int(self.ratio * self.ucd[self.minc])

        # Get the samples corresponding to the current class
        sub_samples_x = self.x[self.y == key]
        sub_samples_y = self.y[self.y == key]

        if self.version == 1:
            # Find the NN
            dist_vec, idx_vec = nn_obj.kneighbors(
                sub_samples_x, n_neighbors=self.size_ngh)
            # Select the right samples
            sel_x, sel_y = self.__SelectionDistBased__(
                dist_vec, num_samples, key, sel_strategy='nearest')
        elif self.version == 2:
            # Find the NN, considering the whole minority class
            dist_vec, idx_vec = nn_obj.kneighbors(
                sub_samples_x, n_neighbors=self.y[self.y == self.minc].size)
            # Select the right samples
            sel_x, sel_y = self.__SelectionDistBased__(
                dist_vec, num_samples, key, sel_strategy='nearest')
        elif self.version == 3:
            # We need a new NN object fitted on the current class
            nn_obj_cc = NearestNeighbors(n_neighbors=self.ver3_samp_ngh,
                                         **self.kwargs)
            nn_obj_cc.fit(sub_samples_x)

            # Find the set of NN to the minority class
            dist_vec, idx_vec = nn_obj_cc.kneighbors(
                self.x[self.y == self.minc])

            # Create the subset containing the samples found during the
            # NN search: linearise the indexes and remove duplicates
            idx_vec = np.unique(idx_vec.reshape(-1))

            # Create the subset
            sub_samples_x = sub_samples_x[idx_vec, :]
            sub_samples_y = sub_samples_y[idx_vec]

            # Compute the NN considering the current class
            dist_vec, idx_vec = nn_obj.kneighbors(
                sub_samples_x, n_neighbors=self.size_ngh)
            sel_x, sel_y = self.__SelectionDistBased__(
                dist_vec, num_samples, key, sel_strategy='farthest')

        underx = concatenate((underx, sel_x), axis=0)
        undery = concatenate((undery, sel_y), axis=0)

    if self.verbose:
        print("Under-sampling performed: " + str(Counter(undery)))

    return underx, undery
# fit network (train_X/train_y assumed from the surrounding script)
history = model.fit(train_X, train_y, epochs=50, batch_size=72,
                    validation_data=(test_X, test_y), verbose=2,
                    shuffle=False)
# plot history
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()

# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], n_hours * n_features))
# invert scaling for forecast: pad the forecast back to the scaler's width
inv_yhat = concatenate((yhat, test_X[:, -7:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:, 0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, -7:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:, 0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)
'''

# Model configuration, compilation and training
for train_index, validation_index in kf.split(X):
    # Train the model below, then test it.
    print("loop num : ", len(accuracy) + 1)
    print("TRAIN: %d" % len(train_index),
          "TEST: %d" % len(validation_index))
def prepare_dect2(csv_path, image_source_dir, data_save_dir, df_train_vin,
                  df_val_vin, k=5, heatmap=True, txt_prefix='dec2_',
                  yaml_name='detect2.yaml', nc=6, ext='.png'):
    class_names = [
        'Atelectasis', 'Cardiomegaly', 'Infiltration', 'Nodule/Mass',
        'Pleural effusion', 'Pneumothorax'
    ]
    class_map = dict(Atelectasis=0, Cardiomegaly=1, Infiltrate=2, Mass=3,
                     Nodule=3, Effusion=4, Pneumothorax=5)

    # adding features to the dataframe
    print('preparing csv file ...')
    df = prepare_nih_bbox_csv(csv_path, image_source_dir, class_map)
    print('done preparing ^^')
    print()

    # splitting the train / val dataset
    print('splitting train val dataset ...')
    df = stratified_kfold_split(df, k=k, heatmap=heatmap)
    print('done splitting data ^^')
    print()

    fold = 0
    df_train = df[df.fold != fold]
    df_val = df[df.fold == fold]

    # preparing train / val dirs
    img_train_dir = os.path.join(data_save_dir, 'images', 'train')
    img_val_dir = os.path.join(data_save_dir, 'images', 'val')
    label_train_dir = os.path.join(data_save_dir, 'labels', 'train')
    label_val_dir = os.path.join(data_save_dir, 'labels', 'val')
    os.makedirs(img_train_dir, exist_ok=True)
    os.makedirs(img_val_dir, exist_ok=True)
    os.makedirs(label_train_dir, exist_ok=True)
    os.makedirs(label_val_dir, exist_ok=True)

    # copying images to the appropriate dirs
    # creating .txt labels files
    print('segregating data ...')
    segregate_data(df_train, img_train_dir, label_train_dir)
    segregate_data(df_val, img_val_dir, label_val_dir)
    print('done segregating data ^^')
    print()

    df_train['image_new_path'] = df_train.image_id.apply(
        lambda x: os.path.join(img_train_dir, x + ext))
    df_val['image_new_path'] = df_val.image_id.apply(
        lambda x: os.path.join(img_val_dir, x + ext))

    print('filtering vin data ...')
    df_train_vin = filter_vin_to_nih(df_train_vin)
    df_val_vin = filter_vin_to_nih(df_val_vin)
    print('done filtering vin data ^^')
    print()

    # concatenate each pair of dataframes
    print('concatenating dataframes ...')
    df_train = pd.concat([df_train, df_train_vin], axis=0,
                         ignore_index=True)
    df_val = pd.concat([df_val, df_val_vin], axis=0, ignore_index=True)
    print('done concatenating ^^')
    print()

    # prepare .txt files
    train_txt = os.path.join(data_save_dir, txt_prefix + 'train.txt')
    val_txt = os.path.join(data_save_dir, txt_prefix + 'val.txt')
    print('preparing .txt files ...')
    prepare_txt(train_txt, df_train.image_new_path.unique())
    prepare_txt(val_txt, df_val.image_new_path.unique())
    print('done preparing .txt files ^^')
    print()

    # prepare .yaml file
    print('preparing .yaml files ...')
    prepare_yaml(data_save_dir, yaml_name, (train_txt, val_txt), nc,
                 class_names)

    return df_train, df_val
        bsobj.findAll('cite')[3].get_text(),
        bsobj.findAll('cite')[4].get_text(),
        bsobj.findAll('a')[0].attrs['href'],
        bsobj.findAll('a')[0].get_text(),
        bsobj.findAll('a')[1].attrs['href'],
        bsobj.findAll('a')[1].attrs['title'],
        bsobj.findAll('cite')[2].get_text(),
        bsobj.findAll('a')[2].attrs['href']
        if bool(bsobj.findAll('cite')[2].find('a')) else None
    ] for bsobj in bsObj.find("ul", {"class": "newlist"}).findAll("li")],
        columns=urls_colnames)
    return temp_array


pool = Pool()
total_list = pool.map(get_urls_info, raw_pool)
# pd.concat stacks the per-page frames; no np.array wrapper is needed
temp = pd.concat(total_list, axis=0)
temp.回复数 = temp.回复数.apply(lambda x: int(x))
temp.to_csv("C:/Users/User/Desktop/华南BOSS/NLP+策略/store.csv")
temp1 = temp[temp.回复数 > 4]
url_pool = [i for i in temp1["帖子内链"]]
sub_pool = [url_pool[4]]
sub_pool.append(url_pool[22])


# =================== Fetch sub-forum post info ===================
def get_suburls_info(url):
    resp = requests.get(url, headers=headers)
    bsObj = BeautifulSoup(resp.text, "lxml")
    temp_array1 = np.array([[
        int(bsobj.findAll('span')[0].get_text()),
        int(bsobj.findAll('span')[1].get_text()),
        bsobj.find('span', {'class': 'l6'}).get_text(),  # post time