Example #1
def create_catalogue_section():
    config.log_message("Converting the catalogue to JSON...")
    # NB: `pd` here is the DataFrame returned by read_sheet, shadowing the
    # usual pandas alias; dropna() is the DataFrame method.
    pd = read_sheet('catalogue', {'key': str, 'value': str})
    pd = pd.dropna()

    result = pd.to_dict(orient="records")
    supported_values = ['title', 'description', 'creator', 'contactPoint',
                        'license', 'versionInfo', 'keyword', 'identifier',
                        'rights', 'publisher_name', 'publisher_url']
    catalogue_dict = {}
    publisher = {}
    for row in result:
        if row['key'] in supported_values:
            # Keys that need extra work to fit the required structure
            if row['key'] == 'publisher_name':
                publisher['name'] = row['value']
            elif row['key'] == 'publisher_url':
                publisher['url'] = row['value']
            elif row['key'] == 'keyword':
                catalogue_dict['keyword'] = row['value'].split(",")
            else:
                catalogue_dict[row['key']] = row['value']

    catalogue_dict["publisher"] = publisher
    config.log_message("Done!")

    return catalogue_dict
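
For context, a sketch of what dropna() followed by to_dict(orient="records") produces (throwaway data, not the real sheet):

import numpy as np
import pandas as pd

sheet = pd.DataFrame({'key': ['title', None, 'keyword'],
                      'value': ['My dataset', 'orphan', 'a,b']})
rows = sheet.dropna().to_dict(orient="records")
# [{'key': 'title', 'value': 'My dataset'}, {'key': 'keyword', 'value': 'a,b'}]
print(rows)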
Example #2
import numpy as np
import tushare as ts


def get_returnSeason(x, Year):
    # Weighted quarterly ROE for the stocks in x.
    basicdata = ts.get_stock_basics()
    basicdata = basicdata.loc[x, ['name', 'outstanding']]
    basicdata['code'] = basicdata.index
    returnSeasonly = np.zeros(4)
    prev_cumulative = 0.0

    for i in range(4):
        # ts.get_profit_data occasionally returns None; retry until a
        # DataFrame comes back. (NB: `pd` here is a DataFrame, shadowing
        # the usual pandas alias.)
        pd = ts.get_profit_data(Year, i + 1)
        while pd is None:
            pd = ts.get_profit_data(Year, i + 1)

        pd = pd.loc[:, ['code', 'roe']]
        pd = pd.dropna()
        pd.index = pd['code']

        Return = pd.loc[x, ['code', 'roe']]
        if len(Return) == 0:
            print('Empty data, skipping ' + str(Year) + str(i))
            returnSeasonly[i] = 0
            break
        Return['weight'] = basicdata['outstanding'] / basicdata[
            'outstanding'].sum()
        Return['Wreturn'] = Return['weight'] * Return['roe']
        # The original differencing implies roe is cumulative over the year;
        # subtract the previous quarter's cumulative value to de-cumulate.
        cumulative = Return['Wreturn'].sum()
        returnSeasonly[i] = cumulative - prev_cumulative
        prev_cumulative = cumulative
        print('Processed ' + str(Year) + str(i))
    return returnSeasonly
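
A bare `while pd is None` loop like the one above can spin forever against a flaky API; a minimal sketch of a bounded retry with a pause (fetch_with_retry is a hypothetical helper, not part of the original):

import time

def fetch_with_retry(fetch, attempts=5, pause=2.0):
    # Call fetch() up to `attempts` times, sleeping between tries;
    # returns None if every attempt fails.
    for _ in range(attempts):
        result = fetch()
        if result is not None:
            return result
        time.sleep(pause)
    return None

# e.g. pd = fetch_with_retry(lambda: ts.get_profit_data(Year, i + 1))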
Example #3
def drop_na_col(self, pd, rows, ratio):
    '''
    Drop columns that contain too many missing values.
        @param  pd      dataset to evaluate (a DataFrame; the parameter name
                        shadows the usual pandas alias)
        @param  rows    row count used as the deletion baseline
        @param  ratio   fraction of rows that sets the borderline;
                        rows * ratio defines the cutoff
    '''
    barrier = rows * ratio
    # NB: thresh is the minimum number of non-NA values a column needs to
    # survive, so this drops columns with more than rows - barrier NaNs.
    return pd.dropna(thresh=barrier, axis=1)
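
A quick sketch of the thresh semantics on a throwaway frame:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4],
                   'b': [1, np.nan, np.nan, np.nan],
                   'c': [1, 2, np.nan, 4]})
# Keep only columns with at least 3 non-NA values: 'b' (1 non-NA) is dropped.
print(df.dropna(thresh=3, axis=1).columns.tolist())  # ['a', 'c']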
Example #4
def create_configuration_section():
    config.log_message("Converting the configuration to JSON...")
    pd = read_sheet('configuration', {'key': str, 'value': str})
    pd = pd.dropna()

    result = pd.to_dict(orient="records")
    supported_values = ['visibility', 'workflow_key', 'code']
    configuration_dict = {}
    for row in result:
        if row['key'] in supported_values:
            configuration_dict[row['key']] = row['value']

    config.log_message("Done!")

    return configuration_dict
Example #5
# Imports assumed throughout these notes:
import pandas as pd
import sklearn.preprocessing as sklp

# DataFrame attributes and methods, e.g. data.xxx

data.shape
data.label.value_counts()
data.label.nunique()
data.label.describe()  # like summary() in R
sample.plot.scatter(x='feat1', y='feat2')
data.plot(kind='bar')


# pd module-level functions, e.g. pd.xxx

pd.Categorical(data.label).codes  # encode a categorical column as integers
pd.unique(data.label)
# NB: dropna/fillna are DataFrame/Series methods, not pd-level functions:
data.dropna()
data.fillna(0)

# plot
data.boxplot(column='finish', by='material')

### Preprocessing-------------------------------------------------------------
# Scale
sklp.minmax_scale(data, (0, 1))  # data must be a numeric DataFrame or ndarray
standardized_Dataset = sklp.scale(Dataset, axis=0)
Normalized_Dataset = sklp.normalize(Dataset, norm='l2')
binarized_Dataset = sklp.binarize(Dataset, threshold=0.0)

# Missing data

imp = sklp.Imputer(missing_values=0, strategy='mean', axis=0)
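
Imputer was deprecated in scikit-learn 0.20 and removed in 0.22; a sketch of the replacement with SimpleImputer:

import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan]])
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
print(imp.fit_transform(X))  # NaNs replaced by the column means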
Example #6
def __repr__(self):
    # Returns a string form of the dataframe, using dropna() to remove any
    # null values (nulls usually get added to the class as a glitch).
    # NB: dropna() is a DataFrame method, and __repr__ must return a str.
    return repr(self.df.dropna())
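
A minimal sketch of the repaired pattern in context (the class name Wrapped is hypothetical):

import numpy as np
import pandas as pd

class Wrapped:
    def __init__(self, df):
        self.df = df

    def __repr__(self):
        # Show the frame without the glitch-introduced null rows.
        return repr(self.df.dropna())

print(Wrapped(pd.DataFrame({'a': [1.0, np.nan, 3.0]})))  # rows 0 and 2 only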
Example #7
# Performing data wrangling / feature selection
features = [
    'SNO', 'CID', 'firstname', 'lastname', 'fullname', 'Gender', 'EMAILID',
    'MobileNo', 'DOB', 'National', 'Colname', 'University', 'GPAX', 'XYEAR',
    'GPAXII', 'XIIYEAR', 'Current', 'Degree', 'Specialization', 'UGGPA'
]

# NB: the selected frame is bound to `pd`, shadowing the pandas alias
pd = data[features]

# Removing null values
pd = pd.dropna()

# Selecting the x features (10th- and 12th-grade GPA) for regression
x_features = ['GPAXII', 'GPAX']

x = pd[x_features]

# Selecting the y feature (undergraduate GPA) for regression
y_features = ['UGGPA']
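
The snippet stops before the fit itself; a minimal sketch of the implied next step (assuming scikit-learn, variable names carried over; this continuation is not part of the original):

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

y = pd[y_features]  # `pd` is the cleaned frame from above
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
model = LinearRegression().fit(x_train, y_train)
print(model.score(x_test, y_test))  # R^2 on the held-out split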
Example #8
df2.shape
df2
df2 += 0.015 * np.random.randn(len(df2), 2)
df2
df2.plot()
plt.show()
plt.show(block=False)
df1.plot()
plt.show(block=False)
df2.plot()
plt.show(block=False)
df2.index[0]
time_shift(df1['y'].values, df2['y'].values)
df2['y'].dropna().values
%paste
time_shift(df1['y'].values, df2['y'].values)
np.hanning
%paste
time_shift(df1['y'].values, df2['y'].values)
plt.plot(df1['y'].values)
plt.show(block=False)
plt.plot(df1['y'].values)
plt.show(block=False)
plt.plot(df2['y'].values)
plt.show(block=False)
delta = 2027 - 815
s1 = df1['y'].values
s2 = df2['y'].values
xcorr = smooth(np.correlate(s1, s2), window_len=min(11, int(0.15 * len(s1))))
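
For reference, the standard way to turn a cross-correlation into a lag estimate (a sketch; time_shift and smooth are the session's own helpers, assumed but not shown here):

import numpy as np

def estimate_lag(s1, s2):
    # Full cross-correlation of the mean-removed signals; the argmax gives
    # the offset of s1 relative to s2 (positive: s1 lags s2).
    xcorr = np.correlate(s1 - s1.mean(), s2 - s2.mean(), mode='full')
    return int(np.argmax(xcorr)) - (len(s2) - 1)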
Example #9
# print(t1)
# index holds the row labels, columns the column labels

# first row
# print(t1.head(1))
# show the last three rows
# print(t1.tail(3))
# print(t1.info())
# print(t1.describe())
""" Handling missing data """
# test for NaN
# pd.isnull(df)
# pd.notnull(df)
# dropping
# axis=0 drops rows; how='all' drops a row only when every value is NaN,
# how='any' drops it when a single NaN is present; inplace controls whether
# the frame is modified in place. NB: dropna is a DataFrame method,
# not a pd-level function.
df.dropna(axis=0, how='any', inplace=False)
# filling in data (pick one)
t.fillna(t.mean())
t.fillna(t.median())
t.fillna(0)
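
# A quick demonstration of the dropna/fillna options above on a throwaway
# frame (demo is a hypothetical name, not from the original notes):
import numpy as np
import pandas as pd

demo = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, 5.0, 6.0]})
print(demo.dropna(how='any'))    # keeps only row 2, the fully populated one
print(demo.dropna(how='all'))    # keeps all rows: none is entirely NaN
print(demo.fillna(demo.mean()))  # per-column mean imputation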

# np.nan

# t1.join(t2)
# missing entries are all treated as NaN

# by default, merge is an inner join
# inner: intersection of keys; outer: union; left/right: keep that side's
# keys and fill the other side with NaN
# t1.merge(t3, on='a', how='inner')
# t1.merge(t3, left_on='a', right_on='x')
""" Grouping and aggregation """
grouped = df.groupby(by='Country')
""" Indexing """