from sklearn.preprocessing import MinMaxScaler


def cut_series(data, time, fs):
    """Split a signal into windows of `time` seconds at sampling rate `fs`,
    min-max scale each window to [0, 1], and append it to the global x_data list."""
    cut_length = int(time * fs)  # samples per window; cast so slicing works for non-integer inputs
    number = int(len(data) / cut_length)  # number of complete windows
    for i in range(number):
        cut = data[i * cut_length:(i + 1) * cut_length].reshape(-1, 1)
        cut = MinMaxScaler(feature_range=(0, 1)).fit_transform(cut)
        x_data.append(cut.flatten())
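# Minimal usage sketch (hypothetical values): cut a 1-minute synthetic signal
# sampled at 360 Hz into 10-second windows. `x_data` is assumed to be a
# module-level list that cut_series appends to.
import numpy as np

x_data = []
signal = np.sin(np.linspace(0, 2 * np.pi * 60, 60 * 360))  # 60 s at 360 Hz
cut_series(signal, time=10, fs=360)
print(len(x_data), x_data[0].shape)  # 6 windows of 3600 samples each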
import numpy as np
import wfdb
from sklearn.preprocessing import MinMaxScaler


def read_single_data(type, number):
    """Read one record, denoise it, scale it to [0, 1], and append it to x_data."""
    file_path = PATH + type + number
    record = wfdb.rdrecord(file_path, channels=[0])  # read the record
    rdata = record.p_signal.flatten()
    data = denoise(rdata)  # denoise (external helper defined elsewhere)
    new_data = np.array(data).reshape(-1, 1)
    new_data = MinMaxScaler(feature_range=(0, 1)).fit_transform(new_data)
    x_data.append(new_data.flatten())
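# Usage sketch (assumptions: PATH points at a local WFDB database such as
# MIT-BIH, and denoise() is the project's own filter; an identity stand-in
# is used here just so the call runs).
PATH = './mitdb/'
x_data = []
denoise = lambda x: x  # placeholder for the real denoising step
read_single_data('', '100')  # reads record ./mitdb/100 via wfdb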
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


def rescaler(series, max_range_old, min_range_old, max_range_new,
             min_range_new, invert_sign=False):
    """Rescales a series, while maintaining the distance of the highest and
    lowest value in the data to the highest and lowest value of the desired
    range."""
    min_series_old = series.min()
    max_series_old = series.max()
    # Distance from the range extremes to the range midpoint, old and new.
    minmax_to_mid_old = abs(max_range_old) - abs((max_range_old + min_range_old) / 2)
    minmax_to_mid_new = abs(max_range_new) - abs((max_range_new + min_range_new) / 2)
    # Calculate how big the max and min in the series are compared to the
    # maximal values possible.
    diff_range_series_max_old = abs(max_range_old) - abs(max_series_old)
    diff_range_series_min_old = abs(min_range_old) - abs(min_series_old)
    frac_to_max = diff_range_series_max_old / minmax_to_mid_old
    frac_to_min = diff_range_series_min_old / minmax_to_mid_old
    diff_range_series_max_new = frac_to_max * minmax_to_mid_new
    diff_range_series_min_new = frac_to_min * minmax_to_mid_new
    max_series_new = max_range_new - diff_range_series_max_new
    min_series_new = min_range_new - diff_range_series_min_new
    # Scale the data.
    series = MinMaxScaler(
        feature_range=(min_series_new, max_series_new)).fit_transform(
            series.to_numpy().reshape(-1, 1))
    series = pd.Series(series.flatten())
    # Aridity has to be inverted, as a value of 1 in the files provided by
    # Wouter is equal to -1 in regard to the value in his paper.
    if invert_sign:
        series = -series  # flips positives to negatives and vice versa
    return series
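# Worked example: values spanning [0.2, 0.8] inside an old range of [0, 1]
# keep their relative headroom when moved to a new range of [0, 100],
# landing on [20.0, 80.0].
import pandas as pd

s = pd.Series([0.2, 0.5, 0.8])
print(rescaler(s, max_range_old=1, min_range_old=0,
               max_range_new=100, min_range_new=0))
# 0    20.0
# 1    50.0
# 2    80.0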
import csv

import numpy as np
import pandas as pd
from scipy.special import logsumexp
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler


def calc_pca(self, write=True, dom_data=(None, None), explain_fa_filepath=None):
    """Method to perform Principal Component Analysis.

    Steps performed in this function are referenced from the following
    source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5065523/

    Args:
        write: Boolean; when True, output is written to a file.
        dom_data: (flag, data) tuple passed when PCA is performed per domain.
            dom_data[0]: flag showing whether domain data is present.
            dom_data[1]: the domain data subset.
        explain_fa_filepath: file path for writing factor analysis results
            per domain.

    Returns:
        scores: list of scores for towns in alphabetical order.
    """
    # If acting on domain data, set the Kaiser criterion cutoff to 0.01;
    # otherwise the cutoff is 1.0.
    if dom_data[0]:
        data = dom_data[1]
        cut_off = 1e-2
    else:
        data = pd.read_csv(self.data, index_col=0)
        cut_off = 1.0

    # Convert data to a numpy array for further processing.
    loaded_data = np.array(data)
    num_var = loaded_data.shape[1]  # number of variables
    # calc is the cutoff above which eigenvector correlations are significant.
    calc = np.abs(np.sqrt(1 / num_var))
    # Covariance matrix of the data.
    cov_mat = np.cov(loaded_data.T)
    # Perform PCA on the covariance matrix.
    self.pca = PCA()
    self.pca.fit(cov_mat)

    # Kaiser criterion: retain components with eigenvalues > cut_off.
    selected_components = np.argwhere(
        self.pca.explained_variance_ > cut_off).flatten()
    selected_vr = self.pca.explained_variance_ratio_[selected_components]
    # Second criterion: among those, retain components explaining > 10% of variance.
    mod_idxs = np.argwhere(selected_vr.flatten() > 0.1)
    selected_vr = selected_vr[selected_vr > 0.1]
    selected_components = selected_components[mod_idxs].flatten()

    # If neither criterion retains a component, fall back to one PC.
    self.n = len(selected_components)
    if self.n < 1:
        self.n = 1

    # Weight every PC by its share of explained variance among the selected
    # PCs, normalized in log space for numerical stability.
    weights = np.exp(np.log(self.pca.explained_variance_)
                     - logsumexp(np.log(self.pca.explained_variance_[:self.n])))
    # Keep only the n selected PCs.
    weights = weights[:self.n]

    # Keep only the eigenvector correlations that meet the indicator
    # variable loading cutoff; the rest are zeroed out.
    self.var_load = self.pca.components_.T[:, :self.n]
    self.var_load[np.abs(self.var_load) < calc] = 0

    # Calculate a feature score for every feature.
    factor_scores = self.var_load @ weights

    # The health score for every town is a linear combination of the
    # original data values weighted by the feature scores.
    health_status = np.array([loaded_data @ factor_scores]).T
    health_status = MinMaxScaler().fit_transform(health_status)
    health_status = health_status.flatten()

    # Round scores to 2 decimal places.
    scores = [round(x, 2) for x in health_status]
    towns = self.extract_towns()

    if write:
        # Combine towns and health scores and write them to a file.
        health_scores = sorted(zip(towns, scores), key=lambda x: x[1])
        with open(self.output, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(['Town', 'Health Score'])
            writer.writerows(health_scores)
    return scores
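# Standalone sketch of the same scoring idea on toy data (not the class
# method above): Kaiser criterion on eigenvalues, a loading cutoff of
# sqrt(1/num_var), variance-share weights, then a min-max scaled index.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 5))                       # 50 towns, 5 indicators
pca = PCA().fit(np.cov(X.T))
n = max(1, int((pca.explained_variance_ > 1.0).sum()))
loadings = pca.components_.T[:, :n]
loadings[np.abs(loadings) < np.sqrt(1 / X.shape[1])] = 0
weights = pca.explained_variance_[:n] / pca.explained_variance_[:n].sum()
index = MinMaxScaler().fit_transform((X @ (loadings @ weights)).reshape(-1, 1))
print(index.flatten()[:5])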
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.preprocessing import MinMaxScaler

# sce, cle, scores, scbin, data, alg, loc, outpath, training_len and
# performance() come from the surrounding experiment script.
s_rel, m_rel = sce, np.copy(cle)
# Collapse the five class labels into a binary relevance mask:
# classes 1 and 2 map to -1, classes 0, 3 and 4 map to 1.
m_rel[m_rel == 2] = -1
m_rel[m_rel == 1] = -1
m_rel[m_rel == 3] = 1
m_rel[m_rel == 4] = 1
m_rel[m_rel == 0] = 1
rel = performance(s_rel, m_rel)
with open(outpath + 'ex_Loc_Rel.txt', 'a') as f:
    print(alg, loc, rel, file=f)
print("Locality: ", loc, ", Relativity: ", rel)

# Scatter plot of the test points, coloured by the min-max scaled scores.
fig = plt.figure(1, figsize=(4, 4), dpi=100)
sc = -1 / (1 + scores[training_len:])  # map scores into (-1, 0)
sc = sc.reshape(-1, 1)
sc2 = MinMaxScaler().fit_transform(sc)
plt.scatter(data[training_len:, 0], data[training_len:, 1],
            s=1, cmap='coolwarm', c=sc2.flatten())
plt.colorbar(ticks=[np.min(sc2), np.max(sc2)])
plt.title(alg + ' scores', fontsize=14)
plt.savefig(outpath + 'ex' + alg + '.png')

# Scatter plot highlighting the binary top-100 outlier flags.
fig = plt.figure(2, figsize=(4, 4), dpi=100)
plt.scatter(data[training_len:, 0], data[training_len:, 1],
            s=1, cmap='Reds', c=scbin.flatten())
plt.colorbar(ticks=[np.min(scbin), np.max(scbin)])
plt.title(alg + ' top 100 outliers', fontsize=14)
plt.savefig(outpath + 'ex_bin_' + alg + '.png')

# Crop the colorbar off the saved figure and paste the rest onto a white canvas.
img = Image.open(outpath + 'ex_bin_' + alg + '.png')
left, top, right, bottom = 0, 0, 313, 400
img_res = img.crop((left, top, right, bottom))
img_fin = Image.new("RGB", (400, 400), (255, 255, 255))
img_fin.paste(img_res, (left, top, right, bottom))
img_fin.save(outpath + 'ex_bin_' + alg + '.png')
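# Tiny check of the label collapse (hypothetical labels): only classes
# 1 and 2 end up as -1, everything else as 1.
import numpy as np

labels = np.array([0, 1, 2, 3, 4])
mask = np.where(np.isin(labels, [1, 2]), -1, 1)
print(mask)  # [ 1 -1 -1  1  1]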
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler


def preprocess_df(self, df):
    if self.config['metric'] == 'conversion time':
        df = df.loc[df['converted'] == True]
    # Days between the original MQL date and the join date for converted leads.
    df['join_days'] = df.apply(
        lambda x: abs(pd.Timedelta(x['joined_at'] - x['original_mql_at']).days)
        if x['converted'] == True else np.nan,
        axis=1)

    days1 = df.loc[df['converted'] == True, 'join_days'].values
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax = sns.kdeplot(days1)
    fig.savefig(os.path.join(self.dir_['result'], 'hist_conversion_days.png'))

    # Log-transform the conversion days and plot the density again.
    days1 = np.log(days1 + 1)
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax = sns.kdeplot(days1)
    fig.savefig(os.path.join(self.dir_['result'], 'log_hist_conversion_days.png'))

    employees = df['number_of_employees'].values
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax = sns.kdeplot(employees)
    fig.savefig(os.path.join(self.dir_['result'], 'log_hist_employees.png'))

    # Estimated plan cost: $39 base plus $6 per signup.
    df['plan cost'] = 39.0 + df['lead_signup_size'] * 6.0
    plancost = df['plan cost'].values
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax = sns.kdeplot(plancost)
    fig.savefig(os.path.join(self.dir_['result'], 'log_hist_plancost.png'))

    # Bucket converted leads into quartiles of the scaled conversion time and
    # invert the labels so faster conversions get higher class values.
    days1 = days1[:, np.newaxis]
    days1_idx = df.loc[df['converted'] == True, 'id'].values
    days1 = MinMaxScaler().fit_transform(days1)
    days1 = days1.flatten()
    days1 = pd.qcut(days1, 4, labels=False)
    days1 = 4 - days1

    # Non-converted leads get class 0.
    days0_idx = df.loc[df['converted'] == False, 'id'].values
    y0 = np.zeros(days0_idx.shape[0])
    y = np.concatenate([days1, y0])
    idx = np.concatenate([days1_idx, days0_idx])
    temp2 = pd.DataFrame({'id': idx, 'Class': y})
    df = pd.merge(left=df, right=temp2.set_index('id', drop=True),
                  left_on=['id'], how='left', right_index=True)
    df['Class'] = df['Class'].astype(int)
    df = self.process_x(df)
    return df
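# Small check of the quartile labelling used above (synthetic numbers):
# the fastest conversions land in class 4 and the slowest in class 1.
import pandas as pd

days = pd.Series([1, 2, 3, 10, 20, 40, 80, 160])
labels = 4 - pd.qcut(days, 4, labels=False)
print(labels.values)  # [4 4 3 3 2 2 1 1]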