import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


def data_process(load_train, load_test):
    # Read the data and set up the column groups to process
    df = pd.read_csv(load_train)
    df_test = pd.read_csv(load_test)  # read but not used below
    feature_index = [
        "BILL_1_s", "BILL_2_s", "BILL_3_s", "BILL_4_s", "BILL_5_s", "BILL_6_s",
        "AGE", "SEX_le", "EDUCATION_le", "MARRIAGE_le"
    ]
    encoder_index = ["SEX", "EDUCATION", "MARRIAGE", "RISK"]
    stande_index = ["BILL_1", "BILL_2", "BILL_3", "BILL_4", "BILL_5", "BILL_6"]

    # Standardize the numeric columns (reshaped to 2-D for the scaler);
    # the scaled copies get an "_s" suffix
    for i in stande_index:
        df_s = StandardScaler().fit_transform(df[i].values[:, np.newaxis])
        df[i + "_s"] = df_s[:, 0]
    df_s = df.drop(stande_index, axis=1)

    # LabelEncode each categorical column and append the encoded version
    for i in encoder_index:
        df_encoder = LabelEncoder().fit_transform(df_s[i])
        df_s[i + "_le"] = df_encoder

    # Drop the original (pre-encoding) columns
    df_new = df_s.drop(encoder_index, axis=1)

    # Split the dataset
    train_features, test_features, train_labels, test_labels = train_test_split(
        df_new[feature_index], df_new['RISK_le'],
        test_size=0.2, random_state=42)
    return train_features, test_features, train_labels, test_labels
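# A minimal usage sketch for data_process (an assumption, not part of the
# original: the file names "train.csv"/"test.csv" are hypothetical, and the
# files are expected to contain the BILL_*, AGE, SEX, EDUCATION, MARRIAGE,
# and RISK columns handled above).
train_features, test_features, train_labels, test_labels = data_process(
    "train.csv", "test.csv")
print(train_features.shape, train_labels.shape)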
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def main(csv_path):
    df = pd.read_csv(csv_path)
    # Separating out the features
    X = df[df.columns[:-1]]
    # Separating out the target
    Y = df[df.columns[-1]]
    # Standardizing the features
    X_standard = StandardScaler().fit_transform(X)

    # First pass: let PCA pick the number of components via Minka's MLE
    pca = PCA(n_components='mle')
    pca.fit_transform(X_standard)
    pca_1 = pd.DataFrame(pca.components_, columns=X.columns)
    output = pca_1.copy()
    output['explained_variance_ratio'] = pca.explained_variance_ratio_
    print(output)

    # Drop the features with a negative loading on the first component,
    # then rerun PCA on the reduced feature set
    X_standard = pd.DataFrame(X_standard, index=X.index, columns=X.columns)
    X_standard = X_standard.drop(
        pca_1.columns[pca_1.apply(lambda col: col.iloc[0] < 0)], axis=1)
    pca = PCA(n_components='mle')
    pca.fit_transform(X_standard)
    pca_2 = pd.DataFrame(pca.components_, columns=X_standard.columns)
    output = pca_2.copy()
    output['explained_variance_ratio'] = pca.explained_variance_ratio_
    print(output)
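# A sketch of a common alternative to n_components='mle' (an assumption, not
# part of the original pipeline): passing a float in (0, 1) makes PCA keep
# just enough components to explain that share of the variance.
def main_fixed_variance(csv_path, variance=0.95):
    df = pd.read_csv(csv_path)
    X = StandardScaler().fit_transform(df[df.columns[:-1]])
    pca = PCA(n_components=variance)  # e.g. keep 95% of the variance
    X_reduced = pca.fit_transform(X)
    print(pca.n_components_, pca.explained_variance_ratio_.sum())
    return X_reduced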
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from ptype.Ptype import Ptype  # assumed import path for the ptype-cat package


def feature_selection(df, target):
    # Mapping from ptype's inferred types to pandas dtypes (kept for reference)
    convert_dct = {'integer': 'int64', 'string': 'object', 'float': 'float64',
                   'boolean': 'bool', 'date-iso-8601': 'datetime64[ns]',
                   'date-eu': 'datetime64[ns]',
                   'date-non-std-subtype': 'datetime64[ns]',
                   'date-non-std': 'datetime64[ns]', 'gender': 'category',
                   'all-identical': 'category'}
    # Infer column types with ptype and keep only the numeric columns
    # (ptype reports 'integer'/'float', matching the keys of convert_dct)
    ptype = Ptype()
    ptype.run_inference(df)
    predicted = ptype.predicted_types
    features = []
    for key in predicted:
        if predicted[key] in ('integer', 'float'):
            features.append(key)

    x = df.loc[:, features].values
    x = StandardScaler().fit_transform(x)
    x = pd.DataFrame(x, columns=features)
    X = x.drop(target, axis=1)  # Feature matrix
    y = x[target]               # Target variable

    # Search for the number of features that maximizes the held-out R^2
    nof_list = np.arange(1, len(features))
    high_score = 0
    nof = 0  # optimum number of features
    score_list = []
    for n in range(len(nof_list)):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=0)
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=nof_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if score > high_score:
            high_score = score
            nof = nof_list[n]

    # Refit RFE on the full data with the optimum number of features
    cols = list(X.columns)
    model = LinearRegression()
    rfe = RFE(model, n_features_to_select=nof)
    X_rfe = rfe.fit_transform(X, y)
    model.fit(X_rfe, y)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp].index
    quality_measure = nof / len(features)
    return quality_measure
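# A usage sketch for feature_selection (an assumption: "housing.csv" and the
# "price" target are hypothetical). The returned quality measure is the
# fraction of the numeric features that RFE keeps.
df = pd.read_csv("housing.csv")
quality = feature_selection(df, target="price")
print("fraction of numeric features selected:", quality)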
import numpy as np
import pandas as pd
from sklearn import datasets as ds
from sklearn.preprocessing import StandardScaler


def _get_data(case):
    if case == 'case1':
        df_birth = pd.read_csv('dataset_birthwt.csv')
        df_part1 = StandardScaler().fit_transform(df_birth[['age', 'lwt']])
        df_part2 = df_birth[['race', 'smoke', 'ptl', 'ht', 'ui', 'ftv']]
        feat_names = ['age', 'lwt'] + list(df_part2.columns)
        y = StandardScaler().fit_transform(
            df_birth['bwt'].values[:, None])[:, 0]
        X = np.hstack((df_part1, df_part2))
    elif case == 'case2':
        df_prostate = pd.read_csv('dataset_prostate.csv')
        y = df_prostate['lpsa']
        feat_names = ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp',
                      'gleason', 'pgg45']
        X = StandardScaler().fit_transform(df_prostate[feat_names])
    elif case == 'case3':
        bun = ds.load_diabetes()
        X, y = bun.data, bun.target
        X = StandardScaler().fit_transform(X)
        feat_names = bun.feature_names
    elif case == 'case4':
        df_fev = pd.read_csv('dataset_FEV.csv')
        df_fev.drop(labels='id', axis=1, inplace=True)
        feat_names = ['age', 'fev', 'height', 'sex', 'smoke']
        # Standardize the numeric columns, one-hot encode the last two
        df_part1 = pd.DataFrame(
            StandardScaler().fit_transform(df_fev[feat_names[:-2]].values),
            columns=feat_names[:-2])
        df_part2 = pd.get_dummies(df_fev[feat_names[-2:]], drop_first=True)
        y = StandardScaler().fit_transform(
            df_part1['fev'].values[:, None])[:, 0]
        df_part1.drop(labels='fev', axis=1, inplace=True)
        X = np.hstack((df_part1.values, df_part2.values))
        feat_names = list(df_part1.columns) + list(df_part2.columns)
        feat_names = feat_names[:-1] + ['smoker']
    else:
        raise ValueError("unknown case: %s" % case)
    return X, y, feat_names
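# A usage sketch for _get_data: 'case3' needs no local CSV, since it loads
# scikit-learn's bundled diabetes dataset.
X, y, feat_names = _get_data('case3')
print(X.shape, feat_names)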
## Save output
np.savetxt("{}Ha{}_{}_{}_{}.csv".format(NMFreg_output, K, alpha, l1_ratio,
                                        random_state), Ha, delimiter=",")
np.savetxt("{}Wa{}_{}_{}_{}.csv".format(NMFreg_output, K, alpha, l1_ratio,
                                        random_state), Wa, delimiter=",")

# Scale the factor loadings without centering (NMF output is non-negative)
Ha_norm = StandardScaler(with_mean=False).fit_transform(Ha)
Ha_norm = pd.DataFrame(Ha_norm)
Ha_norm['barcode'] = atlasdge.index.tolist()

# For each cell, the factor with the largest loading
maxloc = Ha_norm.drop('barcode', axis=1).values.argmax(axis=1)
cell_clusters['maxloc'] = maxloc
cell_clusters.head()

#num_atlas_clusters = np.unique(cell_clusters['cluster']).size

#### Interpret these factors as cell type assignments ####
f = open("{}Log.txt".format(NMFreg_output), 'a')
f.write("Mapping Atlas Cells to Clusters\n")
f.close()

num_atlas_clusters = max(cell_clusters['cluster'])
bins = num_atlas_clusters
factor_to_celltype_df = pd.DataFrame(0, index=range(1, num_atlas_clusters + 1),
pca_fit = pca.fit_transform(df)
# Explained variance ratio: shows how much of the information each
# component accounts for
pca.explained_variance_ratio_

# Working through an example
df = pd.read_csv("diabetes.csv", sep=",")
df = df.dropna()
# (these columns are numeric, so get_dummies leaves them unchanged)
dms = pd.get_dummies(df[['Age', 'DiabetesPedigreeFunction', 'Insulin']])
y = df["Outcome"]
# I need to drop the values that can't be read
df
X_ = df.drop(['Outcome', 'Age', 'DiabetesPedigreeFunction', 'Insulin'],
             axis=1).astype('float64')
X = pd.concat([X_, dms[['DiabetesPedigreeFunction', 'Insulin']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)
# Fit the model (note: a regressor is fit on the binary Outcome target)
lgb_model = LGBMRegressor().fit(X_train, y_train)
lgb_model
# Make predictions
y_pred = lgb_model.predict(X_test)
y_pred
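# A minimal evaluation sketch (an assumption, not in the original): score the
# LGBMRegressor predictions on the held-out split with RMSE.
import numpy as np
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("test RMSE:", rmse)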
a_rob.append(b_rob)

#df_num_norm['mahal_normal'] = a
df_num_norm['mahal_rob'] = a_rob
#df_v2['mahal_normal'] = a
df_v2['mahal_rob'] = a_rob

#a = pd.DataFrame(a)
a_rob = pd.DataFrame(a_rob)
#p = a.quantile(0.9)
p_rob = a_rob.quantile(0.9)

# Keep only the rows below the 90th percentile of the robust
# Mahalanobis distance
df_num_norm = df_num_norm[df_num_norm.mahal_rob < p_rob.iloc[0]]
df_v2 = df_v2[df_v2.mahal_rob < p_rob.iloc[0]]
df_num_norm = df_num_norm.drop(['mahal_rob'], axis=1)

# Eigenvectors and eigenvalues of the sample covariance matrix
eig_vals, eig_vecs = np.linalg.eig(np.array(cov_df))
print('Eigenvectors \n%s' % eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)

# Build a list of (eigenvalue, eigenvector) pairs
eig_pares = [(np.abs(eig_vals[i]), eig_vecs[:, i])
             for i in range(len(eig_vals))]
# Sort the pairs in descending order of eigenvalue
eig_pares.sort(key=lambda x: x[0], reverse=True)
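# A sketch of the step that usually follows the sorted eigen pairs (an
# assumption, not in the original): the per-component and cumulative share of
# variance explained, computed from the eigenvalues.
tot = np.sum(np.abs(eig_vals))
var_exp = [pair[0] / tot for pair in eig_pares]
cum_var_exp = np.cumsum(var_exp)
print(cum_var_exp)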
# Run KMeans
kmeans = KMeans(n_clusters=7).fit(users)

# Plot the KMeans results
df_labeled = pd.DataFrame(kmeans.labels_, columns=['labels'])
df_labeled['labels'] = df_labeled['labels'].astype('category')
plt.figure(figsize=(10, 8))
df_labeled['labels'].value_counts().plot.bar(color='y')
plt.xlabel("Cluster")
plt.ylabel("Number of clients")
plt.title("Number of clients per cluster")
#plt.show()

# Append the KMeans labels to users
users = users.join(df_labeled)

# Plot the dendrogram
plt.figure(figsize=(20, 10))
merg = linkage(users.drop('labels', axis=1), method='ward')
dendrogram(merg, leaf_rotation=360)
plt.title('Dendrogram')
#plt.show()

# Define the hierarchical clustering
hier_clus = AgglomerativeClustering(n_clusters=5, affinity='euclidean',
                                    linkage='ward')
cluster = hier_clus.fit_predict(users.drop('labels', axis=1))
# Append the hierarchical clustering labels
users['Agg_label'] = cluster

# Plot the hierarchical clustering label counts
df_labeled = pd.DataFrame(hier_clus.labels_, columns=['labels'])
df_labeled['labels'] = df_labeled['labels'].astype('category')
plt.figure(figsize=(10, 8))
df_labeled['labels'].value_counts().plot.bar(color='y')
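# A sketch (an assumption, not in the original) comparing the KMeans and
# agglomerative partitions with the adjusted Rand index: 1.0 means identical
# clusterings, values near 0 mean chance-level agreement.
from sklearn.metrics import adjusted_rand_score

print(adjusted_rand_score(users['labels'], users['Agg_label']))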
# Backfill Volume data
###############################################################################
if 1:
    # Get historical Volume data
    volume, rat_volume = backfill_volume_with_singular(currencies, granularity,
                                                       _from, _to)
    volume.reset_index(inplace=True)
    volume = volume.rename({'index': 'timestamp'}, axis=1)
    volume.to_pickle(volume_path)

    # Try a standardized copy of the volume columns
    ss = StandardScaler().fit_transform(volume.iloc[:, 1:])
    ss = pd.DataFrame(ss, columns=volume.columns[1:], index=volume.index)
    ss.insert(0, 'timestamp', volume.timestamp)
    ss.drop('jpy', axis=1, inplace=True)
    ss.drop('hkd', axis=1, inplace=True)

    cu, ratios = backfill_with_singular(currencies, granularity, _from, _to)
    cu.reset_index(inplace=True)
    cu = cu.rename({'index': 'timestamp'}, axis=1)
    cu.to_pickle(cu_small_path)
    ratios.reset_index(inplace=True)
    ratios = ratios.rename({'index': 'timestamp'}, axis=1)

###############################################################################
# Update Volume and Diff
###############################################################################
if 0:
def NMFreg(
    counts,
    coords,
    size,
    metacell_dict,
    gene_intersection,
    num_atlas_clusters,
    celltype_to_factor_dict,
    celltype_dict,
    plot_size_dict,
):
    # Restrict the puck to the genes shared with the atlas, indexed by barcode
    puckcounts = counts[["barcode"] + gene_intersection]
    puckcounts = puckcounts.set_index(counts["barcode"])
    puckcounts = puckcounts.drop("barcode", axis=1)

    # Normalize each bead by its total UMI count, then scale without centering
    cell_totalUMI = np.sum(puckcounts, axis=1)
    puckcounts_cellnorm = np.divide(puckcounts, cell_totalUMI.values[:, None])
    puckcounts_scaled = StandardScaler(
        with_mean=False).fit_transform(puckcounts_cellnorm)

    # Solve a non-negative least squares problem per bead against the
    # atlas factor matrix WaT
    XsT = puckcounts_scaled.T
    Hs_hat = []
    for b in tqdm(range(XsT.shape[1])):
        h_hat = scipy.optimize.nnls(WaT, XsT[:, b])[0]
        if b == 0:
            Hs_hat = h_hat
        else:
            Hs_hat = np.vstack((Hs_hat, h_hat))

    Hs = pd.DataFrame(Hs_hat)
    Hs["barcode"] = puckcounts.index.tolist()
    Hs_norm = StandardScaler(with_mean=False).fit_transform(
        Hs.drop("barcode", axis=1))
    Hs_norm = pd.DataFrame(Hs_norm)
    Hs_norm["barcode"] = puckcounts.index.tolist()

    # Assign each bead to the factor with the largest loading, then map
    # factors to atlas clusters
    maxloc_s = Hs_norm.drop("barcode", axis=1).values.argmax(axis=1)
    barcode_clusters = pd.DataFrame()
    barcode_clusters["barcode"] = Hs_norm["barcode"]
    barcode_clusters["max_factor"] = maxloc_s
    barcode_clusters["atlas_cluster"] = barcode_clusters["barcode"]
    for c in range(1, num_atlas_clusters + 1):
        condition = np.isin(barcode_clusters["max_factor"],
                            celltype_to_factor_dict[c])
        barcode_clusters.loc[condition, "atlas_cluster"] = c

    # Collapse the factor loadings to per-cell-type loadings
    bead_deconv_df = Hs_norm.apply(
        lambda x: deconv_factor_to_celltype(
            row=x,
            adict=factor_to_celltype_dict,
            K=K,
            num_atlas_clusters=num_atlas_clusters,
        ),
        axis=1,
    )
    bead_deconv_df.insert(0, "barcode", Hs_norm["barcode"])
    bead_deconv_df.columns = ["barcode"] + (bead_deconv_df.columns[1:] +
                                            1).tolist()
    bead_deconv_df = bead_deconv_df.rename(columns=celltype_dict)

    # The dominant cell type per bead
    maxloc_ct = bead_deconv_df.drop("barcode", axis=1).values.argmax(axis=1) + 1
    bead_maxct_df = pd.DataFrame()
    bead_maxct_df["barcode"] = bead_deconv_df["barcode"]
    bead_maxct_df["max_cell_type"] = maxloc_ct

    return Hs, Hs_norm, puckcounts, bead_deconv_df, barcode_clusters, bead_maxct_df
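# A sketch (an assumption, not in the original) of a cheaper accumulation than
# the repeated np.vstack above: collect the per-bead NNLS solutions in a list
# and stack once at the end, which avoids quadratic copying.
def _nnls_all(WaT, XsT):
    return np.array([scipy.optimize.nnls(WaT, XsT[:, b])[0]
                     for b in range(XsT.shape[1])])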
# Create the target directory if it doesn't exist
model_dirname = folderPath + 'model/'
if not os.path.exists(model_dirname):
    os.mkdir(model_dirname)
    print("Directory", model_dirname, "created")
else:
    print("Directory", model_dirname, "already exists")

# model_dirname already contains folderPath, so don't prepend it again
model.save(model_dirname + "model.h5")
print("Saved model to disk")

print('Preparing Testing data')
X_test = pd.read_csv(dataPath + 'preprocessed_test.csv')
y_train = pd.read_csv(dataPath + 'preprocessed_train_y.csv')

# Remove ID
X_test = X_test.drop('Id', axis=1)
y_train = y_train.drop('Id', axis=1)

# Divide the input into 2 groups
X_related_test = X_test[['OverallQual', 'GrLivArea', 'GarageCars',
                         'TotalBsmtSF', 'FullBath', 'YearBuilt']]
X_effected_test = X_test.drop(['OverallQual', 'GrLivArea', 'GarageCars',
                               'TotalBsmtSF', 'FullBath', 'YearBuilt'], axis=1)

# Scale the data (note: the scalers are refit on the test set here)
X_test = StandardScaler().fit_transform(X_test)
X_related_test = StandardScaler().fit_transform(X_related_test)
X_effected_test = StandardScaler().fit_transform(X_effected_test)
scaler = StandardScaler().fit(y_train)

print('Predicting...')
result = model.predict([X_related_test, X_effected_test])
# Inverse transformation back to the original target scale
result = scaler.inverse_transform(result)
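# A sketch (an assumption, not in the original script) of persisting the
# inverse-transformed predictions alongside the saved model.
import numpy as np

np.savetxt(model_dirname + 'predictions.csv', result, delimiter=',')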
## standardising
X.describe()
features = X[X.columns[:5]]
features_standard = StandardScaler().fit_transform(
    features)  # Gaussian standardisation
X = pd.DataFrame(features_standard,
                 columns=['pgc', 'np', 'age', 'dpf', 'bmi'])
features = X[X.columns]
features.describe()

Y = df['class']
Y.info()
Y.value_counts()

##### LR model
Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
    X, Y, test_size=0.15, random_state=42)
Xtrain.info()
ytrain.value_counts()
ytest.value_counts()
lrmodel = linear_model.LogisticRegression()
lrmodel.fit(Xtrain, ytrain)
lrpredicted = lrmodel.predict(Xtest)
printresult(ytest, lrpredicted)

# Precision/recall trade-off over the decision threshold
distances = lrmodel.decision_function(Xtest)
precision, recall, thresh = metrics.precision_recall_curve(ytest, distances)
plt.plot(thresh, precision[:-1], color="b")
plt.plot(thresh, recall[:-1], color="g")
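# A finishing sketch for the precision/recall plot (an assumption: the labels,
# legend, and show() call are not in the original).
plt.xlabel("decision threshold")
plt.legend(["precision", "recall"])
plt.show()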