Example #1
0
 def drop_minus_row(self, pd, target):
     '''
     Drop rows whose target column holds a non-positive value.

         @param  pd      DataFrame to filter (the name shadows the pandas
                         module; kept for interface compatibility)
         @param  target  column whose non-positive rows are removed
         @return a new DataFrame without the offending rows
     '''
     # NOTE: <= 0 also removes zero-valued rows, not only negative ones as
     # the original (Japanese) docstring claimed.
     # BUG FIX: removed an unreachable statement that followed this return
     # (it filtered self.cust on '生年月日' and discarded the result).
     return pd.drop(pd[pd[target] <= 0].index)
def normalize_df(df):
    """Replace outliers in ``number_of_times`` with a short-term rolling
    mean, then mean/min-max normalize every column.

    NOTE: mutates the caller's ``df`` in place (adds a temporary
    ``short_term_mean`` column and overwrites outlier values).

    :param df: DataFrame with a numeric ``number_of_times`` column
    :return: normalized copy of ``df`` without the helper column
    """
    # 6-sample rolling mean used as the replacement value for outliers.
    df["short_term_mean"] = df.number_of_times.rolling(6).mean()

    # Points further than 3 standard deviations from the mean are outliers.
    where_outliers_be = np.abs(df.number_of_times -
                               df.number_of_times.mean()) >= (
                                   3 * df.number_of_times.std())

    # BUG FIX: use .loc instead of chained assignment
    # (df.number_of_times[mask] = ...), which writes through a temporary
    # view and is unreliable in modern pandas.
    df.loc[where_outliers_be, "number_of_times"] = df.short_term_mean[
        where_outliers_be]

    normalized_df = (df - df.mean()) / (df.max() - df.min())
    # BUG FIX: the original called pd.drop(...), which does not exist on the
    # pandas module; drop the helper column from the result DataFrame instead.
    normalized_df = normalized_df.drop(columns="short_term_mean")
    return normalized_df
Example #3
0
    def drop_irrelevant_y_var(self, pd, price_exist):
        """Remove forecast columns that are not prediction targets.

        Drops every technical-indicator column at time (t) and at
        (t+1)..(t+n_seq-1), and — when the share price itself is absent —
        the lagged Share Price(t-1)..(t-n_lag) columns as well.

        :param pd: DataFrame of supervised-learning columns (the name
            shadows the pandas module; kept for interface compatibility)
        :param price_exist: keep the lagged share-price columns when True
        :return: DataFrame without the irrelevant columns
        """
        indicators = [name.upper() for name in self.input_tech_indicators_list]

        # Indicator columns at the current step ...
        columns_to_drop = [ind + "(t)" for ind in indicators]
        # ... and at every future step up to n_seq-1.
        columns_to_drop += [
            ind + "(t+%d)" % step
            for ind in indicators
            for step in range(1, self.n_seq)
        ]

        if not price_exist:
            columns_to_drop += [
                "Share Price(t-%d)" % (lag + 1) for lag in range(self.n_lag)
            ]

        #print("columns_to_drop", columns_to_drop)
        return pd.drop(columns_to_drop, axis=1)
Example #4
0
# Load a two-space-delimited file (multi-char separators need the python
# engine), keep only the first five columns, skip the first row, and use
# column 0 as the index.
# NOTE(review): `infile_dt` and `colnames` are defined elsewhere — confirm
# before running this snippet in isolation.
df = pd.read_csv(infile_dt, sep='  ', engine='python',  usecols=[0,1,2,3,4], 
                header=None, skiprows=[0], names=colnames, index_col=[0]) 
df  # notebook-style echo of the frame; a no-op when run as a script
# ==================



# === data analysis ===
# --- Compute 3D DZU with partial bottom cell corrected, copied from Steve's github ---
# (https://github.com/sgyeager/POP_MOC/blob/main/pop_moc_0p1deg.py) 
# and commented a few lines. Need a POP history file for 3D DZU
# NOTE(review): `xr` (presumably xarray) and `infile_uvel` are defined
# elsewhere in the file — confirm.
in_file = infile_uvel

ds = xr.open_dataset(in_file)
# WARNING: this rebinds the name `pd` to a DataArray — it shadows pandas
# if pandas was imported as `pd` earlier in this module.
pd     = ds['PD']
# Drop the ULAT/ULONG coordinates from each variable (see note at right).
pd=pd.drop(['ULAT','ULONG'])            # this is a python bug that we are correcting
temp   = ds['TEMP']
temp=temp.drop(['ULAT','ULONG'])
# salt   = ds['SALT']
# salt=salt.drop(['ULAT','ULONG'])
# u_e   = ds['UVEL']/100
# u_e=u_e.drop(['TLAT','TLONG'])
# u_e.attrs['units']='m/s'
# v_e   = ds['VVEL']/100
# v_e=v_e.drop(['TLAT','TLONG'])
# v_e.attrs['units']='m/s'
# w_e   = ds['WVEL']/100
# w_e=w_e.drop(['ULAT','ULONG'])
# w_e.attrs['units']='m/s'
ulat   = ds['ULAT']
ulon   = ds['ULONG']
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# Read the label table and drop quasi-constant columns (variance <= 0.05).
# BUG FIX: the original assigned the DataFrame to `pd`, shadowing the pandas
# module imported above; renamed it to `df`.
df = pd.read_csv("train_label.csv")
constant_filter = VarianceThreshold(threshold=0.05)
constant_filter.fit(df)
# Notebook leftover: the value of this expression is discarded.
len(df.columns[constant_filter.get_support()])


# Columns NOT kept by the variance filter are the (quasi-)constant ones.
constant_columns = [column for column in df.columns
                    if column not in df.columns[constant_filter.get_support()]]

df.drop(labels=constant_columns, axis=1, inplace=True)
print(constant_columns)
df.to_csv("train_label_p.csv",index=False)
Example #6
0
 def _divide_pd(self, pd):
     '''
     Randomly split a DataFrame into train and holdout partitions.

         @param  pd  DataFrame to split (the name shadows the pandas module;
                     kept for interface compatibility)
         @return (train, holdout): train holds cfg.params['train'] of the
                 rows, sampled without replacement; holdout gets the rest
     '''
     sample = np.random.choice(
         a=pd.index,
         size=int(len(pd)*cfg.params['train']),
         replace=False)
     # BUG FIX: DataFrame.ix was removed from pandas; .loc performs the
     # same label-based row selection here.
     return pd.loc[sample], pd.drop(sample)
import pandas as pd
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

#process the data
columns_name = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size",
                "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size",
                "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class"]
# BUG FIX: the original assigned the DataFrame to `pd`, shadowing the pandas
# module imported above; renamed it to `data`.
data = pd.read_csv('breast-cancer-wisconsin.data.txt', header= None, names= columns_name)

# Drop the ID column and replace the '?' missing-value marker.
pd2 = data.drop('Sample code number', axis=1)
pd2.replace('?', -999999, inplace=True)

classResult = pd2['Class']
#print("classResult", classResult)

#split whole data into train part and test part
# NOTE(review): pd2 still contains the 'Class' target among the features
# passed to the classifier — likely target leakage; kept as-is to preserve
# the original behavior, but worth fixing.
X_train, X_test, y_train, y_test = train_test_split(pd2, classResult, test_size=0.2)
#print(X_train, X_test)

#generate kneighborsclassifier
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

print("accuracy ", accuracy)




# Visualize the difference between sepal and petal measurements.
# NOTE(review): `plt` (matplotlib.pyplot), `pd0`, `pd1`, and `pd` are defined
# elsewhere — pd0/pd1 look like per-class subsets of an iris DataFrame, and
# `pd` here is a DataFrame (the name shadows the pandas module); confirm.
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.scatter(pd0['sepal length (cm)'], pd0['sepal width (cm)'],color="green",marker='+')
plt.scatter(pd1['sepal length (cm)'], pd1['sepal width (cm)'],color="blue",marker='.')
plt.show()

plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
plt.scatter(pd0['petal length (cm)'], pd0['petal width (cm)'],color="green",marker='+')
plt.scatter(pd1['petal length (cm)'], pd1['petal width (cm)'],color="blue",marker='.')
plt.show()



# Split features (all columns except the label columns) from the target.
x=pd.drop(["target","flower_name"],axis="columns")
y=pd["target"]

from sklearn.model_selection import train_test_split

# Hold out 20% for testing; fixed seed for reproducibility.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

from sklearn.svm import SVC

# Train an SVC with default settings and report test accuracy.
model=SVC()
model.fit(x_train,y_train)

print(model.score(x_test,y_test))

y_pred=model.predict(x_test)
Example #9
0
# best used on specific col, ideally col representing categorical data
# doesn't require any parameter
# puts into descending order of frequency
# turns into series object, no DataFrame
df.value_counts()
# categorical data refers to statistical data type of categorical var
df['column'].value_counts()

# when sorting multiple features -> place features in a list
filtered_df.sort_values(by=['column_1', 'column_2'], ascending=False)

# day14_combining_knowledge
# pandas filter, sort_values(), pandas groupby()

# drop rows or columns by
# NOTE(review): drop is a DataFrame method — if `pd` is the pandas module
# here (as the usual alias suggests), `pd.drop` raises AttributeError; these
# note lines presumably intend a DataFrame, e.g. df.drop(...). Confirm.
pd.drop()
# drop column labelled 'Unnamed: 0'
pd.drop(columns=['Unnamed: 0'])

# day15_data_visualization
data = {'item_1': 40, 'item_2': 50, 'item_3': 25}
items = list(data.keys())
quantity = list(data.values())

plt.figure()  # frame, start plot with a figure
plt.bar(
    x=items, height=quantity
)  # body, declaring specific bar plot statement, can use DataFrame columns as x and height
plt.title('example bar plot')  # stylistic features, adding title
plt.show()  # show plot
Example #10
0
import pandas as pd

# Show every column when printing, then load the data.
pd.set_option('display.max_columns', None)
data = pd.read_csv('growth.csv')
# BUG FIX: drop is a DataFrame method, not a pandas-module function — the
# original `pd.drop(...)` raises AttributeError; drop from `data` instead.
data = data.drop("Indicator Name", axis=1)
data = data.drop("Indicator Code", axis=1)
print(data.head())
Example #11
0
import pandas as pd
import numpy as np
from  sklearn.neighbors import KNeighborsClassifier

# BUG FIX: the original did `import pandas as base` and then rebound `base`
# to the DataFrame, shadowing the module; use the conventional `pd` alias
# and a distinct DataFrame name.
base = pd.read_csv("C://Users//leona//Desktop//iris.csv")
# BUG FIX: positional `axis` for DataFrame.drop was removed in pandas 2.0 —
# pass it by keyword.
x=np.array(base.drop('target', axis=1))
y=np.array(base.target)
# 3-nearest-neighbour classifier on the four flower measurements.
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(x,y)
print("Digite os seguintes parâmetros: Altura da Sépala,Largura da Sépala,Altura da pétala e Largura da pétala")
print()
alturaSepala=float(input())
larguradaSepala=float(input())
alturaPetala=float(input())
larguradaPetala=float(input())
print("Altura da Sépala:",alturaSepala)
print("Largura da Sépala:",larguradaSepala)
print("Altura da Pétala:",alturaPetala)
print("Largura da Pétala:",larguradaPetala)
aux=knn.predict([[alturaSepala,larguradaSepala,alturaPetala,larguradaPetala]])
print("A flor provavelmente é da classe:",aux[0])

alturaSepala=float(input())
print(knn.predict([[6.5,6.5,4.7,1.3]]))
Example #12
0
import pandas as pd
# BUG FIX: train_test_split and StandardScaler were used below but never
# imported anywhere in this snippet.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = pd.read_csv("")

# Preprocess the data
# Narrow the data set with a bounding-box query
data = data.query("x > 1.0 & x < 1.25 & y >1.5 & y < 1.75")

# Parse the UNIX timestamps
time_value = pd.to_datetime(data["time"] ,unit="s")
time = pd.DatetimeIndex(time_value)

data["hour"] = time.hour
# In pandas axis=0 means rows and axis=1 means columns
# (scikit-learn uses the opposite convention)
# BUG FIX: drop is a DataFrame method; the original called pd.drop(...) on
# the pandas module, which raises AttributeError.
data = data.drop("time" ,axis =1)

# Remove target locations with too few check-ins
place_count = data.groupby("place_id").count()
tf = place_count[place_count.row_id > 3].reset_index()
data = data[data["place_id"].isin(tf.place_id)]
# Split features and target
y = data["place_id"]
x = data.drop("place_id" ,axis = 1)

# Train/test split
x_train,x_test ,y_train ,y_test = train_test_split(x,y ,test_size=0.25)
# Standardize the features
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)