def create_catalogue_section():
    config.log_message("Converting the catalogue to JSON...")
    df = read_sheet('catalogue', {'key': str, 'value': str})
    df = df.dropna()
    result = df.to_dict(orient="records")
    supported_values = ['title', 'description', 'creator', 'contactPoint',
                        'license', 'versionInfo', 'keyword', 'identifier',
                        'rights', 'publisher_name', 'publisher_url']
    catalogue_dict = {}
    publisher = {}
    for row in result:
        if row['key'] in supported_values:
            # Keys that need extra work to fit the required structure
            if row['key'] == 'publisher_name':
                publisher['name'] = row['value']
            elif row['key'] == 'publisher_url':
                publisher['url'] = row['value']
            elif row['key'] == 'keyword':
                catalogue_dict['keyword'] = row['value'].split(",")
            else:
                catalogue_dict[row['key']] = row['value']
    catalogue_dict["publisher"] = publisher
    config.log_message("Done!")
    return catalogue_dict
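# read_sheet() and config come from the surrounding module and are not shown
# in this section. A plausible stand-in for read_sheet(), built on pandas'
# read_excel; WORKBOOK_PATH is a hypothetical constant, not from the source.
import pandas as pd

WORKBOOK_PATH = "metadata.xlsx"  # assumed location of the source workbook

def read_sheet(sheet_name, dtypes):
    # Load one worksheet with explicit per-column dtypes.
    return pd.read_excel(WORKBOOK_PATH, sheet_name=sheet_name, dtype=dtypes)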
def get_returnSeason(x, Year):
    # Look up name and outstanding shares for the requested stock codes.
    basicdata = ts.get_stock_basics()
    basicdata = basicdata.loc[x, ['name', 'outstanding']]
    basicdata['code'] = basicdata.index
    returnSeasonly = np.zeros(4)
    for i in range(4):
        # tushare occasionally returns None; retry until data arrives.
        profit = ts.get_profit_data(Year, i + 1)
        while profit is None:
            profit = ts.get_profit_data(Year, i + 1)
        profit = profit.loc[:, ['code', 'roe']]
        profit = profit.dropna()
        profit.index = profit['code']
        Return = profit.loc[x, ['code', 'roe']]
        if len(Return) == 0:
            print('Empty data, skipping ' + str(Year) + str(i))
            returnSeasonly[i] = 0
            break
        # Weight each stock's ROE by its share of total outstanding shares.
        Return['weight'] = basicdata['outstanding'] / basicdata['outstanding'].sum()
        Return['Wreturn'] = Return['weight'] * Return['roe']
        returnSeasonly[i] = Return['Wreturn'].sum()
        # Reports are cumulative, so difference consecutive quarters.
        if i > 0:
            returnSeasonly[i] = returnSeasonly[i] - returnSeasonly[i - 1]
        print('Processed ' + str(Year) + str(i))
    return returnSeasonly
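# A self-contained check of the weighted-ROE step above on made-up numbers,
# since the tushare calls need network access; all values are synthetic.
import pandas as pd

basic = pd.DataFrame({"outstanding": [120.0, 80.0]}, index=["600000", "600016"])
roe = pd.Series([10.0, 5.0], index=["600000", "600016"])

weight = basic["outstanding"] / basic["outstanding"].sum()  # 0.6 and 0.4
print((weight * roe).sum())  # 10*0.6 + 5*0.4 = 8.0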
def drop_na_col(self, df, rows, ratio):
    '''
    Drop columns that contain too many missing values.
    @param df     dataset to evaluate
    @param rows   number of missing values used as the drop criterion
    @param ratio  fraction of the criterion used as the borderline;
                  rows * ratio defines the borderline count
    '''
    # Note: pandas' thresh is the minimum number of *non-NA* values a
    # column must hold to be kept.
    barrier = rows * ratio
    return df.dropna(thresh=barrier, axis=1)
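# Quick demonstration of the thresh semantics: with rows=4 and ratio=0.5 the
# barrier is 2, so a column survives only if it has at least 2 non-NaN values.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "a": [1, 2, 3, 4],                    # 4 non-NaN -> kept
    "b": [1, np.nan, np.nan, np.nan],     # 1 non-NaN -> dropped
    "c": [1, 2, np.nan, np.nan],          # 2 non-NaN -> kept
})
print(df.dropna(thresh=2, axis=1).columns.tolist())  # ['a', 'c']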
def create_configuration_section():
    config.log_message("Converting the configuration to JSON...")
    df = read_sheet('configuration', {'key': str, 'value': str})
    df = df.dropna()
    result = df.to_dict(orient="records")
    supported_values = ['visibility', 'workflow_key', 'code']
    configuration_dict = {}
    for row in result:
        if row['key'] in supported_values:
            configuration_dict[row['key']] = row['value']
    config.log_message("Done!")
    return configuration_dict
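# One hypothetical way the two section builders could feed a combined JSON
# document; the top-level key names here are assumptions, not the module's.
import json

output = {
    "catalogue": create_catalogue_section(),
    "configuration": create_configuration_section(),
}
with open("metadata.json", "w") as f:
    json.dump(output, f, indent=2)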
# For pd objects, e.g. data.xxx
data.shape
data.label.value_counts()
data.label.nunique()
data.label.describe()                 # like summary() in R
sample.plot.scatter(x='feat1', y='feat2')
data.plot(kind='bar')

# pd functions, e.g. pd.xxx
pd.Categorical(data.Label).codes      # map categorical values to integer codes
pd.unique(data.labels)
data.dropna()                         # dropna/fillna are DataFrame methods, not pd functions
data.fillna(0)

# plot
data.boxplot(column='finish', by='material')

### Preprocessing-------------------------------------------------------------
# Scale
sklp.minmax_scale(data, (0, 1))       # data must be numerical pd or np
standardized_Dataset = sklp.scale(Dataset, axis=0)
Normalized_Dataset = sklp.normalize(Dataset, norm='l2')
binarized_Dataset = sklp.binarize(Dataset, threshold=0.0)

# Missing data
imp = sklp.Imputer(missing_values=0, strategy='mean', axis=0)
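# A small runnable pass over a few of the calls above, on a toy frame. Note
# that sklp.Imputer is the pre-0.22 scikit-learn API; current releases use
# sklearn.impute.SimpleImputer instead.
import pandas as pd
from sklearn import preprocessing as sklp

data = pd.DataFrame({"label": ["a", "b", "a"], "feat1": [1.0, 2.0, 4.0]})
print(data.label.value_counts())
print(pd.Categorical(data.label).codes)            # e.g. [0 1 0]
print(sklp.minmax_scale(data[["feat1"]], (0, 1)))  # rescaled into [0, 1]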
def __repr__(self):
    # Render the dataframe with dropna() applied, since null values
    # usually get added to the class as a glitch. __repr__ must return
    # a string, so the cleaned frame is converted with to_string().
    return self.df.dropna().to_string()
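# Minimal host class for context, since only the method is shown above; the
# class name and constructor are assumptions.
import numpy as np
import pandas as pd

class CleanFrame:
    def __init__(self, df):
        self.df = df

    def __repr__(self):
        return self.df.dropna().to_string()

print(CleanFrame(pd.DataFrame({"x": [1.0, np.nan, 3.0]})))  # prints rows 0 and 2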
# Performing data wrangling / feature selection
features = [
    'SNO', 'CID', 'firstname', 'lastname', 'fullname', 'Gender', 'EMAILID',
    'MobileNo', 'DOB', 'National', 'Colname', 'University', 'GPAX', 'XYEAR',
    'GPAXII', 'XIIYEAR', 'Current', 'Degree', 'Specialization', 'UGGPA'
]

# In[33]:

df = data[features]

# In[34]:

# Removing null values
df = df.dropna()

# In[41]:

# Selecting the x features (10th and 12th grade GPA) for regression
x_features = ['GPAXII', 'GPAX']

# In[42]:

x = df[x_features]

# In[45]:

# Selecting the y feature (undergraduate GPA) for regression
y_features = ['UGGPA']
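# The selected features presumably feed a regression; a hedged sketch using
# scikit-learn's LinearRegression (the estimator choice is an assumption).
from sklearn.linear_model import LinearRegression

y = df[y_features]
model = LinearRegression().fit(x, y)
print(model.coef_, model.intercept_)   # contribution of GPAXII / GPAX to UGGPA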
df2.shape
df2
df2 += 0.015 * np.random.randn(len(df2), 2)
df2
df2.plot()
plt.show(block=False)
df1.plot()
plt.show(block=False)
df2.plot()
plt.show(block=False)
df2.index[0]
time_shift(df1['y'].values, df2['y'].values)
%paste
time_shift(df1['y'].values, df2['y'].values)
np.hanning
%paste
time_shift(df1['y'].values, df2['y'].values)
plt.plot(df1['y'].values)
plt.show(block=False)
plt.plot(df2['y'].values)
plt.show(block=False)
delta = 2027 - 815
s1 = df1['y'].values
s2 = df2['y'].values
xcorr = smooth(np.correlate(s1, s2), window_len=min(11, int(0.15 * len(s1))))
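# time_shift() and smooth() come in via %paste and are not shown; a plausible
# reconstruction of time_shift from the cross-correlation used at the end
# (the pasted implementation itself is unknown).
import numpy as np

def time_shift(a, b):
    # Lag at which b best lines up with a, via full cross-correlation.
    xcorr = np.correlate(a, b, mode="full")
    return int(np.argmax(xcorr)) - (len(b) - 1)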
# print(t1)
# index: row labels, columns: column labels
# First row
# print(t1.head(1))
# Last three rows
# print(t1.tail(3))
# print(t1.info())
# print(t1.describe())

"""
Handling missing data
"""
# Test for NaN
# pd.isnull(df)
# pd.notnull(df)
# Drop rows:
# axis=0 drops rows; how: 'all' drops only if every value is NaN,
# 'any' drops on a single NaN; inplace: whether to modify in place
df.dropna(axis=0, how='any', inplace=False)
# Fill missing data
t.fillna(t.mean())
t.fillna(t.median())
t.fillna(0)
# np.nan
# t1.join(t2)
# Missing entries are all treated as NaN
# Default join is inner
# inner: intersection; outer: union; left/right: keep left/right keys, pad with NaN
# t1.merge(t3, on='a', how='inner')
# t1.merge(t3, left_on='a', right_on="x")

"""
Grouping and aggregation
"""
grouped = df.groupby(by='Country')

"""
Indexing
"""
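# A runnable illustration of the groupby/aggregate step above, on made-up rows.
import pandas as pd

df = pd.DataFrame({"Country": ["US", "US", "CN"], "Value": [1, 2, 3]})
grouped = df.groupby(by="Country")
print(grouped["Value"].sum())   # CN -> 3, US -> 3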