def drop_minus_row(self, df, target):
    '''
    Drop rows whose value in the target column is negative.
    @param df     dataset to evaluate
    @param target column to evaluate (note: the filter below uses <= 0, so zeros are dropped too)
    '''
    return df.drop(df[df[target] <= 0].index)

# Drop customers whose birth date ('生年月日') is masked with '*'
# (assign the result back: drop() is not in-place by default)
self.cust = self.cust.drop(self.cust[self.cust['生年月日'].str.match(r'\*', na=False)].index)
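# A quick usage check of drop_minus_row on a toy frame (a sketch: the method
# never touches self, so None is passed for it; the column name "amount" is made up).
import pandas as pd

toy = pd.DataFrame({"amount": [5, -3, 0, 7]})
print(drop_minus_row(None, toy, "amount"))  # keeps only the strictly positive rows 5 and 7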
import numpy as np


def normalize_df(df):
    # df = pd.DataFrame(df)
    # smooth with a 6-step rolling mean, then replace 3-sigma outliers with it
    df["short_term_mean"] = df.number_of_times.rolling(6).mean()
    where_outliers_be = np.abs(df.number_of_times - df.number_of_times.mean()) >= (
        3 * df.number_of_times.std())
    df.loc[where_outliers_be, "number_of_times"] = df.loc[where_outliers_be, "short_term_mean"]
    # min-max style normalization, then drop the helper column
    # (the original called pd.drop(...), but drop is a DataFrame method)
    normalized_df = (df - df.mean()) / (df.max() - df.min())
    normalized_df = normalized_df.drop("short_term_mean", axis=1)
    return normalized_df
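# A minimal exercise of normalize_df on synthetic data: twenty ones plus one
# large outlier, so the 3-sigma replacement path is actually taken. Values are invented.
import numpy as np
import pandas as pd

toy = pd.DataFrame({"number_of_times": np.concatenate([np.ones(20), [100.0]])})
print(normalize_df(toy).tail())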
def drop_irrelevant_y_var(self, df, price_exist):
    # collect every indicator column at time t and at future steps t+1 .. t+n_seq-1
    columns_to_drop = []
    for indicator in self.input_tech_indicators_list:
        columns_to_drop.append(indicator.upper() + "(t)")
    for indicator in self.input_tech_indicators_list:
        for j in range(1, self.n_seq):
            columns_to_drop.append(indicator.upper() + "(t+%d)" % j)
    if not price_exist:
        # also drop the lagged share-price columns
        for i in range(self.n_lag):
            columns_to_drop.append("Share Price(t-%d)" % (i + 1))
    # print("columns_to_drop", columns_to_drop)
    return df.drop(columns_to_drop, axis=1)
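# A usage sketch for drop_irrelevant_y_var with a stand-in for self carrying the
# attributes the method reads; the indicator name "rsi" and the frame are hypothetical.
import pandas as pd

class _Stub:
    input_tech_indicators_list = ["rsi"]
    n_seq = 2
    n_lag = 1

frame = pd.DataFrame({"RSI(t)": [1], "RSI(t+1)": [2],
                      "Share Price(t-1)": [3], "Share Price(t)": [4]})
print(drop_irrelevant_y_var(_Stub(), frame, price_exist=False).columns.tolist())
# expected: ['Share Price(t)']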
df = pd.read_csv(infile_dt, sep=' ', engine='python', usecols=[0, 1, 2, 3, 4],
                 header=None, skiprows=[0], names=colnames, index_col=[0])
df

# ==================
# === data analysis ===
# --- Compute 3D DZU with partial bottom cell corrected, copied from Steve's github ---
# (https://github.com/sgyeager/POP_MOC/blob/main/pop_moc_0p1deg.py)
# and commented a few lines. Need a POP history file for 3D DZU
in_file = infile_uvel
ds = xr.open_dataset(in_file)
pd = ds['PD']  # caution: this rebinds the name pd, shadowing the pandas import above
pd = pd.drop(['ULAT', 'ULONG'])  # remove the 2D coordinates xarray attaches to this variable
temp = ds['TEMP']
temp = temp.drop(['ULAT', 'ULONG'])
# salt = ds['SALT']
# salt = salt.drop(['ULAT','ULONG'])
# u_e = ds['UVEL']/100
# u_e = u_e.drop(['TLAT','TLONG'])
# u_e.attrs['units'] = 'm/s'
# v_e = ds['VVEL']/100
# v_e = v_e.drop(['TLAT','TLONG'])
# v_e.attrs['units'] = 'm/s'
# w_e = ds['WVEL']/100
# w_e = w_e.drop(['ULAT','ULONG'])
# w_e.attrs['units'] = 'm/s'
ulat = ds['ULAT']
ulon = ds['ULONG']
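# A self-contained illustration of stripping coordinate variables, mirroring the
# PD/TEMP handling above on a synthetic DataArray rather than a POP history file.
import numpy as np
import xarray as xr

da = xr.DataArray(np.zeros((2, 2)), dims=("nlat", "nlon"),
                  coords={"ULAT": (("nlat", "nlon"), np.zeros((2, 2))),
                          "ULONG": (("nlat", "nlon"), np.zeros((2, 2)))})
da = da.drop_vars(["ULAT", "ULONG"])  # drop_vars is the current spelling of .drop
print(da.coords)  # empty: both coordinates removed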
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# note: the original rebound the name pd to the DataFrame; renamed to df here
df = pd.read_csv("train_label.csv")
constant_filter = VarianceThreshold(threshold=0.05)
constant_filter.fit(df)
len(df.columns[constant_filter.get_support()])
# columns rejected by the variance filter are treated as (near-)constant
constant_columns = [column for column in df.columns
                    if column not in df.columns[constant_filter.get_support()]]
df.drop(labels=constant_columns, axis=1, inplace=True)
print(constant_columns)
df.to_csv("train_label_p.csv", index=False)
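# The same VarianceThreshold pattern on a tiny synthetic frame, so the effect is
# visible without train_label.csv: the constant column "b" is the one rejected.
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

toy = pd.DataFrame({"a": [0.0, 1.0, 2.0, 3.0], "b": [1.0, 1.0, 1.0, 1.0]})
flt = VarianceThreshold(threshold=0.05)
flt.fit(toy)
print([c for c in toy.columns if c not in toy.columns[flt.get_support()]])  # ['b']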
def _divide_pd(self, df):
    # sample a training fraction of the index without replacement;
    # .ix was removed from pandas long ago, so use .loc for the lookup
    sample = np.random.choice(a=df.index,
                              size=int(len(df) * cfg.params['train']),
                              replace=False)
    return df.loc[sample], df.drop(sample)
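# The split performed by _divide_pd, sketched inline with the cfg dependency
# replaced by a literal 0.8 train fraction (cfg.params['train'] is external config).
import numpy as np
import pandas as pd

toy = pd.DataFrame({"x": range(10)})
sample = np.random.choice(a=toy.index, size=int(len(toy) * 0.8), replace=False)
train, test = toy.loc[sample], toy.drop(sample)
print(len(train), len(test))  # 8 2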
import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
from sklearn.neighbors import KNeighborsClassifier

# process the data
columns_name = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size",
                "Uniformity of Cell Shape", "Marginal Adhesion",
                "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin",
                "Normal Nucleoli", "Mitoses", "Class"]
df = pd.read_csv('breast-cancer-wisconsin.data.txt', header=None, names=columns_name)
df2 = df.drop('Sample code number', axis=1)
df2.replace('?', -999999, inplace=True)
classResult = df2['Class']
# print("classResult", classResult)

# split whole data into train part and test part;
# drop the label from the features so it does not leak into training
X_train, X_test, y_train, y_test = train_test_split(
    df2.drop('Class', axis=1), classResult, test_size=0.2)
# print(X_train, X_test)

# generate kneighborsclassifier
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print("accuracy ", accuracy)
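# An alternative to the -999999 sentinel above (a sketch, not the original
# author's approach): read '?' as NaN and impute the column medians instead.
from sklearn.impute import SimpleImputer

df_alt = pd.read_csv('breast-cancer-wisconsin.data.txt', header=None,
                     names=columns_name, na_values='?')
X_alt = SimpleImputer(strategy='median').fit_transform(
    df_alt.drop(['Sample code number', 'Class'], axis=1))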
# visualizing the difference between sepal and petal measurements of two classes
# (assumes a DataFrame named pd plus per-class slices pd0 and pd1; see the setup sketch below)
import matplotlib.pyplot as plt

plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.scatter(pd0['sepal length (cm)'], pd0['sepal width (cm)'], color="green", marker='+')
plt.scatter(pd1['sepal length (cm)'], pd1['sepal width (cm)'], color="blue", marker='.')
plt.show()

plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
plt.scatter(pd0['petal length (cm)'], pd0['petal width (cm)'], color="green", marker='+')
plt.scatter(pd1['petal length (cm)'], pd1['petal width (cm)'], color="blue", marker='.')
plt.show()

x = pd.drop(["target", "flower_name"], axis="columns")
y = pd["target"]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

from sklearn.svm import SVC
model = SVC()
model.fit(x_train, y_train)
print(model.score(x_test, y_test))
y_pred = model.predict(x_test)
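# For reference, a sketch of the setup the snippet above assumes: a DataFrame
# (confusingly named pd) built from sklearn's bundled iris data, plus per-class
# slices pd0 and pd1.
import pandas
from sklearn.datasets import load_iris

iris = load_iris()
pd = pandas.DataFrame(iris.data, columns=iris.feature_names)
pd["target"] = iris.target
pd["flower_name"] = pd["target"].apply(lambda t: iris.target_names[t])
pd0 = pd[pd.target == 0]
pd1 = pd[pd.target == 1]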
# best used on a specific column, ideally one representing categorical data
# requires no parameters
# sorts into descending order of frequency
# returns a Series, not a DataFrame
df.value_counts()
# categorical data refers to the statistical data type of a categorical variable
df['column'].value_counts()

# when sorting by multiple features -> place the features in a list
filtered_df.sort_values(by=['column_1', 'column_2'], ascending=False)

# day14_combining_knowledge
# pandas filter, sort_values(), pandas groupby()
# drop rows or columns with DataFrame.drop()
# drop the column labelled 'Unnamed: 0'
# (drop is a DataFrame method, not a function on the pandas module)
df.drop(columns=['Unnamed: 0'])

# day15_data_visualization
data = {'item_1': 40, 'item_2': 50, 'item_3': 25}
items = list(data.keys())
quantity = list(data.values())
plt.figure()  # frame, start the plot with a figure
plt.bar(x=items, height=quantity)  # body, the specific bar-plot statement; DataFrame columns also work as x and height
plt.title('example bar plot')  # stylistic features, adding a title
plt.show()  # show the plot
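# A tiny value_counts illustration on invented categorical data: the result is a
# Series ordered by descending frequency, as the notes above describe.
import pandas as pd

s = pd.Series(["red", "blue", "red", "red", "blue"])
print(s.value_counts())  # red 3, blue 2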
import pandas as pd

pd.set_option('display.max_columns', None)
data = pd.read_csv('growth.csv')
# drop is a DataFrame method, so call it on data rather than on the pandas module
data = data.drop("Indicator Name", axis=1)
data = data.drop("Indicator Code", axis=1)
print(data.head())
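# The two drops can also be combined into a single call; shown on a toy frame
# here so it runs without growth.csv (column names copied from above, data invented).
import pandas as pd

toy = pd.DataFrame({"Indicator Name": ["x"], "Indicator Code": ["y"], "2020": [1.0]})
print(toy.drop(columns=["Indicator Name", "Indicator Code"]))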
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

base = pd.read_csv("C://Users//leona//Desktop//iris.csv")
x = np.array(base.drop('target', axis=1))
y = np.array(base.target)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x, y)

print("Enter the following parameters: Sepal Height, Sepal Width, Petal Height and Petal Width")
print()
alturaSepala = float(input())
larguradaSepala = float(input())
alturaPetala = float(input())
larguradaPetala = float(input())
print("Sepal Height:", alturaSepala)
print("Sepal Width:", larguradaSepala)
print("Petal Height:", alturaPetala)
print("Petal Width:", larguradaPetala)
aux = knn.predict([[alturaSepala, larguradaSepala, alturaPetala, larguradaPetala]])
print("The flower most likely belongs to class:", aux[0])

print(knn.predict([[6.5, 6.5, 4.7, 1.3]]))
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = pd.read_csv("")

# process the data
# narrow the data down with a query filter
data = data.query("x > 1.0 & x < 1.25 & y > 1.5 & y < 1.75")

# handle the timestamp
time_value = pd.to_datetime(data["time"], unit="s")
time = pd.DatetimeIndex(time_value)
data["hour"] = time.hour

# in pandas, axis=0 means rows and axis=1 means columns
# in sklearn, axis=1 means rows and axis=0 means columns
data = data.drop("time", axis=1)

# drop target places with too few check-ins
place_count = data.groupby("place_id").count()
tf = place_count[place_count.row_id > 3].reset_index()
data = data[data["place_id"].isin(tf.place_id)]

# extract the features and the target
y = data["place_id"]
x = data.drop("place_id", axis=1)

# split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# standardization
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)
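# A compact check of the check-in filter on synthetic data: only place_ids with
# more than 3 rows survive. The row_id/place_id values are made up.
import pandas as pd

toy = pd.DataFrame({"row_id": range(6), "place_id": [1, 1, 1, 1, 2, 2]})
place_count = toy.groupby("place_id").count()
tf = place_count[place_count.row_id > 3].reset_index()
print(toy[toy["place_id"].isin(tf.place_id)])  # the four rows with place_id == 1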