def __diff(data: pd.DataFrame) -> pd.DataFrame:
    """
    Add first-difference columns for altitude and position.

    For each of ``masl``, ``lat`` and ``lon`` a column prefixed with ``d``
    (``dmasl``, ``dlat``, ``dlon``) is written into ``data`` itself, so the
    caller's frame is modified in place.

    Returns:
        ``data`` without its first row, whose differences are undefined (NaN).
    """
    for column in ("masl", "lat", "lon"):
        data["d" + column] = data[column].diff()
    # Skip the first row: it has no predecessor to differentiate against.
    return data.iloc[1:]
def gonzales(data, k):
    """
    Pick ``k`` cluster centres with Gonzalez' farthest-first traversal.

    The first point of ``data`` is the first centre; every further centre is
    the point whose Euclidean distance to its closest already-chosen centre
    is largest (the first such point on ties).

    Args:
        data: 2-D numpy array. Column 0 holds the point id, columns 1 and 2
            hold the two coordinates used for the distance; any further
            columns are ignored.
        k: number of centres requested.

    Returns:
        Float array of shape ``(m, 2)`` with the centre coordinates in the
        order they were chosen. ``m == k`` unless ``k <= 0`` or the data has
        fewer than ``k`` distinct points, in which case fewer rows come back.
    """
    # Only the two coordinate columns take part in the distance computation.
    points = np.asarray(data)[:, 1:3].astype(float)
    if k <= 0 or points.shape[0] == 0:
        return np.empty((0, 2))

    chosen = [0]
    # nearest[i] is the distance from point i to its closest chosen centre.
    nearest = np.linalg.norm(points - points[0], axis=1)
    while len(chosen) < k:
        candidate = int(np.argmax(nearest))
        if nearest[candidate] == 0:
            # Every point already coincides with a centre: nothing to add.
            break
        chosen.append(candidate)
        nearest = np.minimum(
            nearest, np.linalg.norm(points - points[candidate], axis=1))
    return points[chosen]
# Mean click rate per site category, in the order the categories appear in
# `site`. Built in one pass instead of growing an array with np.append.
avg_site = np.array([
    np.mean(train["click"][train["site_category"] == category])
    for category in site
])

# In[14]:

site_df = DataFrame({"site": site, "avg_click": avg_site})
# DataFrame.sort(columns=...) no longer exists; sort_values is its successor.
site_df = site_df.sort_values(by="avg_click")
plt.plot(range(len(site_df)), site_df["avg_click"], "bo",
         range(len(site_df)), site_df["avg_click"], "k")

# In[17]:

site_df.tail(2)

# In[21]:

# Flag rows of the "dedf689d" site category. A single .loc assignment writes
# to the frame itself; the chained form frame[col][mask] = 1 may write to a
# temporary copy instead.
t1["special_site"] = 0
t2["special_site"] = 0
test["special_site"] = 0
t1.loc[t1["site_category"] == "dedf689d", "special_site"] = 1
t2.loc[t2["site_category"] == "dedf689d", "special_site"] = 1
test.loc[test["site_category"] == "dedf689d", "special_site"] = 1

# In[22]:

# One formatted string prints identically on Python 2 and Python 3.
print("%s %s %s" % (sum(t1["special_site"]), sum(t2["special_site"]),
                    sum(test["special_site"])))
def tail_view(data: pd.DataFrame) -> pd.DataFrame:
    """Return the trailing rows of ``data`` using ``DataFrame.tail`` defaults."""
    last_rows = data.tail()
    return last_rows
# Write the frame to disk without index or header, then read it back in
# three different ways to show how read_csv treats the first line.
df.to_csv('births1880.txt', index=False, header=False)
Location = r'births1880.txt'

# Default: the first data row is consumed as the header.
df = read_csv(Location)
print(df)
print(df.head())

# header=None: columns are numbered, no row is lost.
df = read_csv(Location, header=None)
print(df)
print(df.tail())

# names=...: explicit column names, no row is lost.
df = read_csv(Location, names=['Names', 'Births'])
print(df.head())

# The temporary file is no longer needed once it is loaded.
import os
os.remove(Location)

print(df['Names'].unique())
for x in df['Names'].unique():
    print(x)
print(df['Names'].describe())
# Interactive inspection of the previously loaded response `ddd`.
ddd["retMsg"]
data = ddd["data"]
len(data)
data[0]
data[300]

######################################################
# Load the order book from the first line of the JSON dump.
orderbook_json_file = "test1.json"
# Binary mode yields bytes, so .decode works on both Python 2 and Python 3;
# the with-block closes the file even if reading fails, and readline avoids
# loading the whole file for a single line.
with open(orderbook_json_file, 'rb') as fp:
    json_string = fp.readline()
orderbook_json_dict = json.loads(json_string.decode('unicode_escape'))
data_list = orderbook_json_dict["data"]
# Drop the first 300 records before building the frame.
data_list = data_list[300:]
dataSet = DataFrame(data_list)
# dataSet.head(10)
dataSet.tail(10)
######################################################
def test_head_tail_empty():
    """``tail()`` and ``head()`` of an empty DataFrame equal that frame."""
    empty_df = DataFrame()
    for view in (empty_df.tail(), empty_df.head()):
        tm.assert_frame_equal(view, empty_df)
dictSeries = Series(myDict)
dictSeries

## Data Structure : Data Frame from a dictionary
# Each key becomes a column. The one-column redefinition of empDict that used
# to follow is gone: it removed the 'name' and 'isManager' columns that the
# lookups below rely on.
empDict = {'id': [1, 2, 3, 4],
           'name': ['Mark', 'Ian', 'Sam', 'Rich'],
           'isManager': [False, True, False, True]}
empDf = DataFrame(empDict)

#Access rows and columns
empDf['name']
empDf['name'][2]
empDf[~empDf.isManager]
empDf.head()
empDf.tail()
empDf.iloc[2,]

#Add a row that also introduces a new column ('deptId')
# concat returns a new frame and leaves empDf untouched; rows that lack
# 'deptId' get NaN there. DataFrame.append, used here before, was removed in
# pandas 2.0.
from pandas import concat
newEmp = DataFrame([{'id': 5, 'isManager': False, 'name': 'Derek', 'deptId': 2}])
concat([empDf, newEmp], ignore_index=True)
empDf

#Deleting a column
empDf['dummy'] = 1
empDf
del empDf['dummy']
empDf

#Deleting a row
# .iloc selects purely by integer position: the first three rows.
df.iloc[:3]

# Columns can be selected by name, one at a time or as a list.
df['Rev']
df[['Rev', 'test']]

# df['ColumnName'][inclusive:exclusive] slices the rows of a column.
df['Rev'][:3]
df['col'][5:]
df[['col', 'test']][0:3]

# There is also a handy function to select the top and bottom records of a
# dataframe.

# Top N records (N defaults to 5)
df.head()

# Bottom N records (N defaults to 5)
df.tail()
yield path, message def dataFrameFromDirectory(path, classification): rows = [] index = [] for filename, message in readFiles(path): rows.append({'message': message, 'class': classification}) index.append(filename) return DataFrame(rows, index=index) #An empty dataframe with 'message' and 'class' headers data = DataFrame({'message': [], 'class': []}) #Including the email details with the spam/ham classification in the dataframe data = data.append( dataFrameFromDirectory( 'F:/UTD/Machine Learning/Dataset1/hw2_train/train/ham', 'ham')) data = data.append( dataFrameFromDirectory( 'F:/UTD/Machine Learning/Dataset1/hw2_train/train/spam', 'spam')) #Head and the Tail of 'data' data.head() print(data.tail()) vectoriser = CountVectorizer() count = vectoriser.fit_transform(data['message'].values) print(count)
print("=" * 50)

# Column labels of the frame
print(city_frame.columns)
print("=" * 50)

# A single column, selected by name
print(city_frame["Population"])
print("=" * 50)

# First three rows
print(city_frame.head(3))
print("=" * 50)

# Last two rows
print(city_frame.tail(2))
print("=" * 50)

# Third row, by position. .iloc replaces .ix, which was removed in pandas 1.0.
print(city_frame.iloc[2])
print("=" * 50)

# Adding a new column: a plain list works just as well as a numpy array, e.g.
# city_frame["Stadium"] = np.array(["Manchester", "Liverpool", "Chelsea"])
city_frame["Stadium"] = ["Manchester", "Liverpool", "Chelsea"]
print(city_frame)
print("=" * 50)

import webbrowser
website = "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html"
class Technical(StockGet):
    """
    Technical indicators for one stock, computed on top of ``StockGet``.

    Constructing an instance runs every indicator once and stores the
    results as attributes; ``self.comb`` holds the final summary list.

    NOTE(review): ``self.db_s``, ``self.db_sw``, ``self.Y``/``self.M``/
    ``self.D`` and ``self.stocknum`` are read here but never assigned in this
    class; presumably ``StockGet.__init__`` provides them (daily prices,
    weekly prices, reference date, stock code) -- confirm against the base
    class.
    """

    def __init__(self, stocknum):
        super().__init__(stocknum)
        # Order matters: smaget needs stock_now (getsymbol) and volatirity
        # (vol_get); getper needs bps (getbps); smacross, bolinger and kairi
        # need the windows and dmatsma set by smaget.
        self.vol_get(self.db_s)  # volatility
        self.getsymbol(self.db_s)  # daily prices
        self.week_getsymbol(self.db_sw)  # weekly prices
        self.macd(self.db_s)
        self.smaget(self.db_s)
        self.getbps()
        self.getper(self.db_s)
        self.smacross(self.db_s)
        self.bolinger(self.db_s)
        self.smaweekget(self.db_sw)
        self.kairi(self.db_s)
        self.combine()

    def vol_get(self, mat):
        """Compute percentage changes of ``mat`` and their sigma bands.

        Sets ``stock_vol`` (percentage change, NaN rows dropped),
        ``volatirity`` (its standard deviation), ``mean_vol`` (its mean) and
        ``stockvolmat`` (the changes plus constant mean +/- 1, 2, 3 sigma
        columns).
        """
        self.stock_vol = 100 * mat.pct_change()
        self.stock_vol = self.stock_vol.dropna()
        self.volatirity = np.std(self.stock_vol)  # std of the changes
        self.mean_vol = np.mean(self.stock_vol)  # mean of the changes
        self.stockvolmat = DataFrame(self.stock_vol,
                                     columns=[
                                         'Stock', '+sigma', '-sigma',
                                         '+2sigma', '-2sigma', '+3sigma',
                                         '-3sigma'
                                     ])
        n_rows = len(self.stockvolmat)
        self.stockvolmat['+sigma'] = np.full(
            n_rows, self.mean_vol + self.volatirity)
        self.stockvolmat['+2sigma'] = np.full(
            n_rows, self.mean_vol + 2 * self.volatirity)
        self.stockvolmat['+3sigma'] = np.full(
            n_rows, self.mean_vol + 3 * self.volatirity)
        self.stockvolmat['-sigma'] = np.full(
            n_rows, self.mean_vol - self.volatirity)
        self.stockvolmat['-2sigma'] = np.full(
            n_rows, self.mean_vol - 2 * self.volatirity)
        self.stockvolmat['-3sigma'] = np.full(
            n_rows, self.mean_vol - 3 * self.volatirity)

    def macd(self, mat):
        """Compute the MACD columns of ``mat`` into ``self.dmatmacd``.

        LONG/SHORT are exponentially weighted means of 'Stock' with spans
        ``setting.L_DUR``/``setting.S_DUR``; V_MACD is SHORT - LONG, V_SIG
        its rolling mean over ``setting.SIG_DUR`` rows, DIF_MACD their
        difference.
        """
        self.s_dur = setting.S_DUR  # MACD parameter
        self.l_dur = setting.L_DUR  # MACD parameter
        self.sig_dur = setting.SIG_DUR  # MACD parameter
        mat = DataFrame(mat.dropna())
        self.dmatmacd = DataFrame(mat,
                                  columns=[
                                      'Stock', 'LONG', 'SHORT', 'V_MACD',
                                      'V_SIG', 'DIF_MACD', 'ZERO'
                                  ])
        self.dmatmacd['ZERO'] = np.full(self.dmatmacd.shape[0], 0)
        self.dmatmacd['LONG'] = self.dmatmacd['Stock'].ewm(
            span=self.l_dur).mean()
        self.dmatmacd['SHORT'] = self.dmatmacd['Stock'].ewm(
            span=self.s_dur).mean()
        self.dmatmacd[
            'V_MACD'] = self.dmatmacd['SHORT'] - self.dmatmacd['LONG']
        self.dmatmacd['V_SIG'] = self.dmatmacd['V_MACD'].rolling(
            window=self.sig_dur, center=False).mean()
        self.dmatmacd[
            'DIF_MACD'] = self.dmatmacd['V_MACD'] - self.dmatmacd['V_SIG']

    def smaget(self, mat):
        """Compute simple moving averages and price bands.

        Sets ``dmatsma`` (Stock plus MA_S/MA_M/MA_L rolling means),
        ``stockmax``/``stockmin`` (extremes of the last 60 days widened by
        2 %), the ``stock_*sig`` frames (today's price scaled by
        ``e ** (factor * volatirity / 100)``) and ``smatoday`` (latest MA_S).

        Returns:
            Tuple ``(dmatsma, stockmax, stockmin)``.
        """
        self.s_idou = setting.S_IDOU
        self.m_idou = setting.M_IDOU
        self.l_idou = setting.L_IDOU
        mat = DataFrame(mat.dropna())
        self.dmatsma = DataFrame(mat,
                                 columns=['Stock', 'MA_S', 'MA_M', 'MA_L'])
        self.dmatsma['MA_S'] = self.dmatsma['Stock'].rolling(
            window=self.s_idou, center=False).mean()
        self.dmatsma['MA_M'] = self.dmatsma['Stock'].rolling(
            window=self.m_idou, center=False).mean()
        self.dmatsma['MA_L'] = self.dmatsma['Stock'].rolling(
            window=self.l_idou, center=False).mean()
        # Prices of the 60 days up to the reference date.
        self.dmat_2month = self.dmatsma['Stock'][
            datetime.datetime(self.Y, self.M, self.D) - datetime.timedelta(
                days=60):datetime.datetime(self.Y, self.M, self.D)]
        self.stockmax = np.nanmax(self.dmat_2month) * 1.02
        self.stockmin = np.nanmin(self.dmat_2month) * 0.98
        self.stock_p1sig = round(
            self.stock_now * (math.e**(self.volatirity / 100)), 0)
        self.stock_p05sig = round(
            self.stock_now * (math.e**(0.5 * self.volatirity / 100)), 0)
        self.stock_n1sig = round(
            self.stock_now * (math.e**(-self.volatirity / 100)), 0)
        self.stock_n05sig = round(
            self.stock_now * (math.e**(-0.5 * self.volatirity / 100)), 0)
        # .iloc makes the positional lookup explicit; a bare integer key on
        # a label-indexed Series relies on a deprecated fallback.
        self.smatoday = self.dmatsma['MA_S'].iloc[-1]
        return self.dmatsma, self.stockmax, self.stockmin

    def getsymbol(self, mat):
        """Extract oldest, latest and previous prices from ``mat``.

        Sets ``dmatstock``, ``stock_old`` (first row), ``stock_now`` (last
        row), ``stock_exp`` (100 * log of latest/oldest), ``stock_kinou``
        (second-to-last price), ``stock_ototoi`` (third-to-last price) and
        the two day-over-day differences.
        """
        mat = DataFrame(mat.dropna())
        self.dmatstock = DataFrame(mat, columns=['Stock'])
        self.stock_old = self.dmatstock.head(1)  # oldest price in the data
        self.stock_now = self.dmatstock.tail(1)  # latest price
        # Log change between the oldest and the latest price, in percent.
        self.stock_exp = 100 * math.log(
            self.stock_now['Stock'].iloc[0] /
            self.stock_old['Stock'].iloc[0], math.e)
        prices = self.dmatstock['Stock']
        self.stock_kinou = prices.iloc[-2]  # yesterday's price
        self.stock_ototoi = prices.iloc[-3]  # price two days ago
        # yesterday - two days ago
        self.dif_kinou_ototoi = self.stock_kinou - self.stock_ototoi
        # today - yesterday; stock_now is a one-row frame, so this is too
        self.dif_today_kinou = self.stock_now - self.stock_kinou

    def week_getsymbol(self, mat):
        """Extract the weekly prices of one, two and three rows back.

        NOTE(review): this overwrites ``self.dmatstock`` set by
        ``getsymbol`` with the weekly data -- confirm that is intended.
        """
        mat = DataFrame(mat.dropna())
        self.dmatstock = DataFrame(mat, columns=['Stock'])
        prices = self.dmatstock['Stock']
        self.stock_1weekmae = prices.iloc[-2]  # weekly price, 1 week ago
        self.stock_2weekmae = prices.iloc[-3]  # weekly price, 2 weeks ago
        self.stock_3weekmae = prices.iloc[-4]  # weekly price, 3 weeks ago
        # 2 weeks ago - 3 weeks ago
        self.dif_3_2_week = self.stock_2weekmae - self.stock_3weekmae
        # 1 week ago - 2 weeks ago
        self.dif_2_1_week = self.stock_1weekmae - self.stock_2weekmae

    def getbps(self):
        """Load ``LASTYEAR_PROFIT_PER_STOCK`` of this stock into ``self.bps``.

        Reads the CSV at ``setting.CSV_DB_PATH`` and selects the rows whose
        ``STOCK_NUM`` equals ``int(self.stocknum)``.
        """
        # Information on all registered stocks.
        self.record_stockdb = pd.read_csv(
            setting.CSV_DB_PATH, names=setting.CSV_DB_PATH_COLUMNS)
        # Row(s) of the target stock. A boolean mask with .loc selects the
        # same rows .ix did; .ix was removed in pandas 1.0.
        self.getinfo_of_stock = DataFrame(
            self.record_stockdb.loc[self.record_stockdb['STOCK_NUM'] == int(
                self.stocknum)])
        self.getinfo_of_stock = self.getinfo_of_stock.set_index('STOCK_NUM')
        # Keep only the 'LASTYEAR_PROFIT_PER_STOCK' column.
        self.stockinfo = DataFrame(
            self.getinfo_of_stock['LASTYEAR_PROFIT_PER_STOCK'])
        self.bps = self.stockinfo['LASTYEAR_PROFIT_PER_STOCK'][int(
            self.stocknum)]

    def getper(self, mat):
        """Compute the PER series and its range for this stock.

        With ``self.bps == 0`` no ratio can be formed: placeholder values
        are stored and ``maper``/``per1``..``per4`` are not set. Otherwise
        ``maper`` gets PER = Stock / bps, its min/max and four evenly
        spaced levels in between, and the ``per_*`` scalars are filled.
        """
        if self.bps == 0:
            print("SKIP of PER_Dict")
            self.per_now = 0
            self.per_max = 1
            self.per_min = 0
            return

        mat = DataFrame(mat.dropna())
        self.maper = DataFrame(mat,
                               columns=[
                                   'Stock', 'PER', 'PER_MIN', 'PER1', 'PER2',
                                   'PER3', 'PER4', 'PER_MAX'
                               ])
        # PER for every price in the data.
        self.maper['PER'] = self.maper['Stock'] / float(self.bps)
        self.maper['PER_MAX'] = np.nanmax(self.maper['PER'])
        self.maper['PER_MIN'] = np.nanmin(self.maper['PER'])
        self.maper['PER1'] = self.maper['PER_MIN'] + (
            self.maper['PER_MAX'] - self.maper['PER_MIN']) / 5
        self.maper['PER2'] = self.maper['PER_MIN'] + 2 * (
            self.maper['PER_MAX'] - self.maper['PER_MIN']) / 5
        self.maper['PER3'] = self.maper['PER_MIN'] + 3 * (
            self.maper['PER_MAX'] - self.maper['PER_MIN']) / 5
        self.maper['PER4'] = self.maper['PER_MIN'] + 4 * (
            self.maper['PER_MAX'] - self.maper['PER_MIN']) / 5
        # The level columns are constant, so the first row carries the value.
        self.per_max = self.maper['PER_MAX'].iloc[0]
        self.per1 = self.maper['PER1'].iloc[0]
        self.per2 = self.maper['PER2'].iloc[0]
        self.per3 = self.maper['PER3'].iloc[0]
        self.per4 = self.maper['PER4'].iloc[0]
        self.per_min = self.maper['PER_MIN'].iloc[0]
        self.per_now = float(self.maper['PER'].iloc[-1])

    def smacross(self, mat):
        """Store price minus short moving average in ``self.ma_smacross``."""
        mat = DataFrame(mat.dropna())
        self.ma_smacross = DataFrame(mat,
                                     columns=['Stock', 'ZERO', 'DIF_SMA'])
        self.ma_smacross['ZERO'] = np.full(self.ma_smacross.shape[0], 0)
        self.ma_smacross['DIF_SMA'] = self.ma_smacross['Stock'] - self.dmatsma[
            'Stock'].rolling(window=self.s_idou, center=False).mean()

    def bolinger(self, mat):
        """Compute Bollinger bands (1, 2, 3 sigma) into ``self.ma_boli``.

        Mean and standard deviation are rolling over ``self.m_idou`` rows.
        """
        mat = DataFrame(mat.dropna())
        self.ma_boli = DataFrame(mat,
                                 columns=[
                                     'Stock', 'MA_M', 'B_U1', 'B_U2', 'B_U3',
                                     'B_L1', 'B_L2', 'B_L3', 'B_STV'
                                 ])
        self.ma_boli['MA_M'] = self.ma_boli['Stock'].rolling(
            window=self.m_idou, center=False).mean()
        self.ma_boli['B_STV'] = self.ma_boli['Stock'].rolling(
            window=self.m_idou, center=False).std()
        self.ma_boli['B_U1'] = self.ma_boli['MA_M'] + self.ma_boli['B_STV']
        self.ma_boli['B_U2'] = self.ma_boli['MA_M'] + 2 * self.ma_boli['B_STV']
        self.ma_boli['B_U3'] = self.ma_boli['MA_M'] + 3 * self.ma_boli['B_STV']
        self.ma_boli['B_L1'] = self.ma_boli['MA_M'] - self.ma_boli['B_STV']
        self.ma_boli['B_L2'] = self.ma_boli['MA_M'] - 2 * self.ma_boli['B_STV']
        self.ma_boli['B_L3'] = self.ma_boli['MA_M'] - 3 * self.ma_boli['B_STV']

    def smaweekget(self, mat):
        """Compute weekly moving averages into ``self.ma_smawkget``.

        Windows come from ``setting.S_IDOU_W``, ``M_IDOU_W``, ``L_IDOU_W``.
        """
        s_idou_w = setting.S_IDOU_W
        m_idou_w = setting.M_IDOU_W
        l_idou_w = setting.L_IDOU_W
        mat = DataFrame(mat.dropna())
        self.ma_smawkget = DataFrame(
            mat, columns=['Stock', 'MA_S', 'MA_M', 'MA_L'])
        self.ma_smawkget['MA_S'] = self.ma_smawkget['Stock'].rolling(
            window=s_idou_w, center=False).mean()
        self.ma_smawkget['MA_M'] = self.ma_smawkget['Stock'].rolling(
            window=m_idou_w, center=False).mean()
        self.ma_smawkget['MA_L'] = self.ma_smawkget['Stock'].rolling(
            window=l_idou_w, center=False).mean()

    def kairi(self, mat):
        """Compute the deviation of price from its medium moving average.

        KAIRI is ``Stock / MA_M * 100 - 100`` over the last 200 rows. Stores
        its rounded extremes, quartile levels, standard deviation and mean,
        and ranks today's value into ``kairi_rank`` (0 = lowest quarter of
        the min..max range, 3 = highest).

        NOTE(review): if today's KAIRI is NaN no branch matches and
        ``kairi_rank`` stays unset, so ``combine`` will fail -- confirm
        whether a default is wanted.
        """
        self.m_idou = setting.M_IDOU
        mat = DataFrame(mat.dropna())
        self.ma_kairi = DataFrame(mat,
                                  columns=[
                                      'Stock', 'MA_M', 'KAIRI', 'MAX', 'MIN',
                                      'L1', 'L2', 'L3', 'ZERO'
                                  ])
        self.ma_kairi['MA_M'] = self.dmatsma['Stock'].rolling(
            window=self.m_idou, center=False).mean()
        self.ma_kairi['KAIRI'] = (self.ma_kairi['Stock'].tail(200) /
                                  self.ma_kairi['MA_M'].tail(200)) * 100 - 100
        self.kairi_max = round(np.max(self.ma_kairi['KAIRI']), 1)
        self.kairi_min = round(np.min(self.ma_kairi['KAIRI']), 1)
        low = self.kairi_min
        span = self.kairi_max - self.kairi_min
        self.ma_kairi['MAX'] = self.kairi_max
        self.ma_kairi['MIN'] = self.kairi_min
        self.ma_kairi['L1'] = low + span / 4
        self.ma_kairi['L2'] = low + 2 * span / 4
        self.ma_kairi['L3'] = low + 3 * span / 4
        self.ma_kairi['ZERO'] = 0
        self.kairi_sig = np.std(self.ma_kairi['KAIRI'])  # std of KAIRI
        self.kairi_mean = np.mean(self.ma_kairi['KAIRI'])  # mean of KAIRI
        kairi_today = round(self.ma_kairi['KAIRI'].iloc[-1], 1)  # today
        # Each elif is only reached when the previous upper bound failed, so
        # the lower bounds need no separate test.
        if kairi_today <= low + 1 * span / 4:
            self.kairi_rank = 0
        elif kairi_today <= low + 2 * span / 4:
            self.kairi_rank = 1
        elif kairi_today <= low + 3 * span / 4:
            self.kairi_rank = 2
        elif kairi_today <= low + 4 * span / 4 + 0.1:
            # The 0.1 margin was added on 2019/01/15: kairi_today equal to
            # kairi_max failed this test, presumably through rounding error.
            self.kairi_rank = 3

    def combine(self):
        """Collect the headline figures into the list ``self.comb``.

        The list uses the week-over-week differences (1 week - 2 weeks ago,
        2 weeks - 3 weeks ago) and ends with yesterday's price.
        """
        self.comb = [
            round(self.volatirity.iloc[0], 2),
            round(self.stock_exp, 2),
            self.stock_now['Stock'].iloc[0],
            self.dif_2_1_week,
            self.dif_3_2_week,
            round(self.per_max, 2),
            round(self.per_min, 2),
            round(self.per_now, 2),
            self.stocknum,
            self.stock_n05sig['Stock'].iloc[0],
            self.stock_p05sig['Stock'].iloc[0],
            self.stock_n1sig['Stock'].iloc[0],
            self.stock_p1sig['Stock'].iloc[0],
            self.smatoday,
            self.kairi_max,
            self.kairi_min,
            self.kairi_rank,
            self.stock_kinou,
        ]
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Build a frame from whatever table is currently on the clipboard.
revenue = pd.read_clipboard()
print(revenue)
print('\n', revenue.columns)
print('\n', revenue['Walmart'])

# Re-project onto chosen columns; a name missing from the data ('yo')
# becomes an all-NaN column.
print(DataFrame(revenue, columns=['1', 'Walmart', 'Retail']))
revenue2 = DataFrame(revenue, columns=['1', 'Walmart', 'Retail', 'yo'])
print(revenue2)
print(revenue2.head(2))
print(revenue2.tail(2))

# First row. .iloc replaces .ix, which was removed in pandas 1.0.
print(revenue.iloc[0])

# Fill the new column from an array; the frame must have exactly five rows.
arraynp = np.array([100, 200, 300, 400, 500])
revenue2['yo'] = arraynp
print(revenue2)

# Assigning a Series aligns on the index: only labels 2 and 3 get values.
arrayseries = Series([200, 500], index=[2, 3])
revenue2['yo'] = arrayseries
print(revenue2)

del revenue2['yo']
print(revenue2)

sample = {'company': ['a', 'b'], 'profit': [1000, 2000]}
print(sample)
def _plot_line(df: pd.DataFrame, head: int, line_name: str, y: str = "close"):
    """
    Build a ``go.Scatter`` trace of column ``y`` against the ``day`` column.

    Args:
        df: frame holding a ``day`` column and the column named by ``y``.
        head: when truthy, only the last ``head`` rows are plotted.
        line_name: name given to the trace.
        y: column used for the y values.
    """
    frame = df.tail(head) if head else df
    return go.Scatter(x=frame["day"], y=frame[y], name=line_name)
class Results:
    """
    A very simple database of results with a notification for new results.
    The new results are fed directly by the :class:`.StrategyBase`, outside of the
    :class:`.EventBus`.

    .. Note::

      Later on, maybe this will be a cool actual database which allows to
      persistently store past evaluations for a given problem.
      This would allow resuming and further a posteriori analysis.
      In the meantime, this is a pandas DataFrame.
    """

    def __init__(self, strategy):
        self.logger = strategy.config.get_logger('RSLTS')
        self.strategy = strategy
        self.eventbus = strategy.eventbus
        self.problem = strategy.problem
        self.results = None  # DataFrame, created on the first add_results
        self._last_nb = 0  # row count after the previous add, for logging

    def add_results(self, new_results):
        """
        Add a list of new @Result objects, then publish a ``new_results``
        event carrying that list.

        The frame is created from the shape of the first result ever added.
        An empty list passed before any result exists is ignored (no event).

        :param new_results: sequence of ``Result`` objects
        """
        from pandas import (DataFrame, MultiIndex, concat)
        if self.results is None:
            if len(new_results) == 0:
                return
            r = new_results[0]
            midx_x = [('x', _) for _ in range(len(r.x))]
            len_cv_vec = 0 if r.cv_vec is None else len(r.cv_vec)
            midx_cv = [('cv', _) for _ in range(len_cv_vec)]
            # NOTE(review): with a non-empty cv_vec the label ('cv', 0)
            # appears twice (vector entry 0 and the scalar cv) -- confirm.
            midx = MultiIndex.from_tuples(
                midx_x + [('fx', 0)] + midx_cv +
                [('cv', 0), ('who', 0), ('error', 0)])
            self.results = DataFrame(columns=midx)

        assert all(isinstance(_, Result) for _ in new_results)
        # notification for all received results at once
        self.eventbus.publish("new_results", results=new_results)

        new_rows = [
            np.r_[r.x, r.fx, [] if r.cv_vec is None else r.cv_vec,
                  [r.cv, r.who, r.error]] for r in new_results
        ]
        results_new = DataFrame(new_rows, columns=self.results.columns)
        # DataFrame.append was removed in pandas 2.0; concat replaces it.
        # Skipping the still-empty frame keeps it out of the concatenation.
        if self.results.empty:
            self.results = results_new
        else:
            self.results = concat([self.results, results_new],
                                  ignore_index=True)

        # Log once per started block of 100 results. Floor division is
        # required: with "/" the comparison is true whenever rows were added.
        if len(self.results) // 100 > self._last_nb // 100:
            self.info()
            self._last_nb = len(self.results)

    def info(self):
        """Log the number of stored results and, at debug level, the last three."""
        self.logger.info("%d results in DB" % len(self))
        self.logger.debug("Dataframe Results:\n%s" % self.results.tail(3))

    def __iadd__(self, results):
        """``db += results`` is shorthand for ``db.add_results(results)``."""
        self.add_results(results)
        return self

    def __len__(self):
        """Number of stored results; 0 while nothing has been added."""
        return len(self.results) if self.results is not None else 0
# 2.3 Creating a time series
# pandas.date_range(start=None, end=None, periods=None, freq='D',
#                   tz=None, normalize=False, name=None, **kwargs)
# Twelve month-end dates starting in January 2018. The explicit offset
# object replaces the lowercase 'm' alias, which is deprecated.
dates = pd.date_range('20180101', periods=12, freq=pd.offsets.MonthEnd())
print(dates)

# Fixed seed so the random frame is reproducible.
np.random.seed(5)
df = pd.DataFrame(np.random.randn(12, 4), index=dates, columns=list('ABCD'))
df

# View the first n rows (n defaults to 5)
df.head()

# View the last 3 rows
df.tail(3)

# Inspect the index, the column names and the underlying values
print(df.index)
print(df.columns)
print(df.values)

# Transpose
# df.T

# Sort by labels (rows with axis=0, columns with axis=1); returns a new frame
df.sort_index(axis=1, ascending=False)

# Sort by the values of a column
class mcmc(CovmatSampler):
    r"""
    Adaptive, speed-hierarchy-aware MCMC sampler (adapted from CosmoMC)
    \cite{Lewis:2002ah,Lewis:2013hha}.
    """
    _at_resume_prefer_new = CovmatSampler._at_resume_prefer_new + [
        "burn_in", "callback_function", "callback_every", "max_tries", "output_every",
        "learn_every", "learn_proposal_Rminus1_max", "learn_proposal_Rminus1_max_early",
        "learn_proposal_Rminus1_min", "max_samples", "Rminus1_stop", "Rminus1_cl_stop",
        "Rminus1_cl_level", "covmat", "covmat_params"]
    # NOTE(review): this extends the parent's `_at_resume_prefer_new`, not a
    # `_at_resume_prefer_old` list -- looks like a copy/paste slip; confirm
    # against CovmatSampler before changing.
    _at_resume_prefer_old = CovmatSampler._at_resume_prefer_new + [
        "proposal_scale", "blocking"]

    # instance variables from yaml
    burn_in: NumberWithUnits
    learn_every: NumberWithUnits
    output_every: NumberWithUnits
    callback_every: NumberWithUnits
    max_tries: NumberWithUnits
    max_samples: int
    drag: bool
    callback_function: Optional[callable]
    blocking: Optional[Sequence]
    proposal_scale: float
    learn_proposal: bool
    learn_proposal_Rminus1_max_early: float
    Rminus1_cl_level: float
    Rminus1_stop: float
    Rminus1_cl_stop: float
    Rminus1_single_split: int
    learn_proposal_Rminus1_min: float
    measure_speeds: bool
    oversample_thin: int
    oversample_power: float

    def set_instance_defaults(self):
        """Sets the defaults of the attributes restored from a checkpoint."""
        super().set_instance_defaults()
        # checkpoint variables
        self.converged = None
        self.mpi_size = None
        self.Rminus1_last = np.inf

    def initialize(self):
        """Initializes the sampler:
        creates the proposal distribution and draws the initial sample."""
        if not self.model.prior.d():
            raise LoggedError(self.log, "No parameters being varied for sampler")
        self.log.debug("Initializing")
        # MARKED FOR DEPRECATION IN v3.0
        if getattr(self, "oversample", None) is not None:
            self.log.warning("*DEPRECATION*: `oversample` will be deprecated in the "
                             "next version. Oversampling is now requested by setting "
                             "`oversample_power` > 0.")
        # END OF DEPRECATION BLOCK
        # MARKED FOR DEPRECATION IN v3.0
        if getattr(self, "check_every", None) is not None:
            self.log.warning("*DEPRECATION*: `check_every` will be deprecated in the "
                             "next version. Please use `learn_every` instead.")
            # BEHAVIOUR TO BE REPLACED BY ERROR:
            self.learn_every = getattr(self, "check_every")
        # END OF DEPRECATION BLOCK
        if self.callback_every is None:
            self.callback_every = self.learn_every
        # Quantities given in units of the dimension ("d"): they are rescaled
        # in set_proposer_blocking() once the cycle length is known.
        self._quants_d_units = []
        for q in ["max_tries", "learn_every", "callback_every", "burn_in"]:
            number = NumberWithUnits(getattr(self, q), "d", dtype=int)
            self._quants_d_units.append(number)
            setattr(self, q, number)
        self.output_every = NumberWithUnits(self.output_every, "s", dtype=int)
        if is_main_process():
            if self.output.is_resuming() and (
                    max(self.mpi_size or 0, 1) != max(get_mpi_size(), 1)):
                raise LoggedError(
                    self.log,
                    "Cannot resume a run with a different number of chains: "
                    "was %d and now is %d.",
                    max(self.mpi_size or 0, 1), max(get_mpi_size(), 1))
        if more_than_one_process():
            if get_mpi().Get_version()[0] < 3:
                raise LoggedError(self.log, "MPI use requires MPI version 3.0 or "
                                            "higher to support IALLGATHER.")
            sync_processes()
        # One collection per MPI process: `name` is the MPI rank + 1
        # (the rank is None when not running under MPI)
        name = str(1 + (get_mpi_rank() or 0))
        self.collection = Collection(
            self.model, self.output, name=name, resuming=self.output.is_resuming())
        self.current_point = OneSamplePoint(self.model)
        # Use standard MH steps by default
        self.get_new_sample = self.get_new_sample_metropolis
        # Prepare callback function
        if self.callback_function is not None:
            self.callback_function_callable = (
                get_external_function(self.callback_function))
        # Useful for getting last points added inside callback function
        self.last_point_callback = 0
        # Monitoring/restore progress
        if is_main_process():
            cols = ["N", "timestamp", "acceptance_rate", "Rminus1", "Rminus1_cl"]
            self.progress = DataFrame(columns=cols)
            self.i_learn = 1
            if self.output and not self.output.is_resuming():
                with open(self.progress_filename(), "w",
                          encoding="utf-8") as progress_file:
                    progress_file.write("# " + " ".join(self.progress.columns) + "\n")
        # Get first point, to be discarded -- not possible to determine its weight
        # Still, we need to compute derived parameters, since, as the proposal "blocked",
        # we may be saving the initial state of some block.
        # NB: if resuming but nothing was written (burn-in not finished): re-start
        if self.output.is_resuming() and len(self.collection):
            initial_point = (self.collection[self.collection.sampled_params]
                             .iloc[len(self.collection) - 1]).values.copy()
            logpost = -(self.collection[_minuslogpost]
                        .iloc[len(self.collection) - 1].copy())
            logpriors = -(self.collection[self.collection.minuslogprior_names]
                          .iloc[len(self.collection) - 1].copy())
            loglikes = -0.5 * (self.collection[self.collection.chi2_names]
                               .iloc[len(self.collection) - 1].copy())
            derived = (self.collection[self.collection.derived_params]
                       .iloc[len(self.collection) - 1].values.copy())
        else:
            # NB: max_tries adjusted to dim instead of #cycles (blocking not computed yet)
            self.max_tries.set_scale(self.model.prior.d())
            self.log.info("Getting initial point... (this may take a few seconds)")
            initial_point, logpost, logpriors, loglikes, derived = \
                self.model.get_valid_point(max_tries=self.max_tries.value)
            # If resuming but no existing chain, assume failed run and ignore blocking
            # if speeds measurement requested
            if self.output.is_resuming() and not len(self.collection) \
                    and self.measure_speeds:
                self.blocking = None
        if self.measure_speeds and self.blocking:
            if is_main_process():
                self.log.warning(
                    "Parameter blocking manually fixed: speeds will not be measured.")
        elif self.measure_speeds:
            n = None if self.measure_speeds is True else int(self.measure_speeds)
            self.model.measure_and_set_speeds(n=n, discard=0)
        self.set_proposer_blocking()
        self.set_proposer_covmat(load=True)
        self.current_point.add(initial_point, derived=derived, logpost=logpost,
                               logpriors=logpriors, loglikes=loglikes)
        self.log.info("Initial point: %s", self.current_point)
        # Max #(learn+convergence checks) to wait,
        # in case one process dies without sending MPI_ABORT
        self.been_waiting = 0
        self.max_waiting = max(50, self.max_tries.unit_value)
        # Burning-in countdown -- the +1 accounts for the initial point (always accepted)
        self.burn_in_left = self.burn_in.value * self.current_point.output_thin + 1
        # Initial dummy checkpoint
        # (needed when 1st "learn point" not reached in prev. run)
        self.write_checkpoint()

    @property
    def i_last_slow_block(self):
        """Index of the block right before the first one whose oversampling
        factor differs from 1; logs a warning and returns 0 if not dragging."""
        if self.drag:
            return next(i for i, o in enumerate(self.oversampling_factors) if o != 1) - 1
        self.log.warning("`i_last_slow_block` is only well defined when dragging.")
        return 0

    @property
    def slow_blocks(self):
        """Blocks up to and including ``i_last_slow_block``."""
        return self.blocks[:1 + self.i_last_slow_block]

    @property
    def slow_params(self):
        """Flat list of the parameters in the slow blocks."""
        return list(chain(*self.slow_blocks))

    @property
    def n_slow(self):
        """Number of parameters in the slow blocks."""
        return len(self.slow_params)

    @property
    def fast_blocks(self):
        """Blocks after ``i_last_slow_block``."""
        return self.blocks[self.i_last_slow_block + 1:]

    @property
    def fast_params(self):
        """Flat list of the parameters in the fast blocks."""
        return list(chain(*self.fast_blocks))

    @property
    def n_fast(self):
        """Number of parameters in the fast blocks."""
        return len(self.fast_params)

    @property
    def acceptance_rate(self):
        """Number of collected samples divided by the sum of their weights."""
        return self.n() / self.collection[_weight].sum()

    def set_proposer_blocking(self):
        """
        Fixes the parameter blocks and their oversampling factors, decides whether
        dragging is still possible, creates the ``BlockedProposer`` and rescales
        the quantities that were given in units of the cycle length.
        """
        if self.blocking:
            # Includes the case in which we are resuming
            self.blocks, self.oversampling_factors = \
                self.model.check_blocking(self.blocking)
        else:
            self.blocks, self.oversampling_factors = \
                self.model.get_param_blocking_for_sampler(
                    oversample_power=self.oversample_power, split_fast_slow=self.drag)
        # Turn off dragging if one block, or if speed differences < 2x, or no differences
        if self.drag:
            if len(self.blocks) == 1:
                self.drag = False
                self.log.warning(
                    "Dragging disabled: not possible if there is only one block.")
            if max(self.oversampling_factors) / min(self.oversampling_factors) < 2:
                self.drag = False
                self.log.warning(
                    "Dragging disabled: speed ratios < 2.")
        if self.drag:
            # The definition of oversample_power=1 as spending the same amount of time in
            # the slow and fast block would suggest a 1/2 factor here, but this additional
            # factor of 2 w.r.t. oversampling should produce an equivalent exploration
            # efficiency.
            self.drag_interp_steps = int(
                np.round(self.oversampling_factors[self.i_last_slow_block + 1] *
                         self.n_fast / self.n_slow))
            if self.drag_interp_steps < 2:
                self.drag = False
                self.log.warning(
                    "Dragging disabled: "
                    "speed ratio and fast-to-slow ratio not large enough.")
        # Define proposer and other blocking-related quantities
        if self.drag:
            # MARKED FOR DEPRECATION IN v3.0
            if getattr(self, "drag_limits", None) is not None:
                self.log.warning("*DEPRECATION*: 'drag_limits' has been deprecated. "
                                 "Use 'oversample_power' to control the amount of "
                                 "dragging steps.")
            # END OF DEPRECATION BLOCK
            self.get_new_sample = self.get_new_sample_dragging
            self.mpi_info("Dragging with number of interpolating steps:")
            max_width = len(str(self.drag_interp_steps))
            self.mpi_info("* %" + "%d" % max_width + "d : %r", 1, self.slow_blocks)
            self.mpi_info("* %" + "%d" % max_width + "d : %r",
                          self.drag_interp_steps, self.fast_blocks)
        elif np.any(np.array(self.oversampling_factors) > 1):
            self.mpi_info("Oversampling with factors:")
            max_width = len(str(max(self.oversampling_factors)))
            for f, b in zip(self.oversampling_factors, self.blocks):
                self.mpi_info("* %" + "%d" % max_width + "d : %r", f, b)
            if self.oversample_thin:
                self.current_point.output_thin = int(np.round(sum(
                    len(b) * o for b, o in zip(self.blocks, self.oversampling_factors)) /
                    self.model.prior.d()))
        # Save blocking in updated info, in case we want to resume
        self._updated_info["blocking"] = list(zip(self.oversampling_factors, self.blocks))
        sampled_params_list = list(self.model.parameterization.sampled_params())
        blocks_indices = [[sampled_params_list.index(p) for p in b] for b in self.blocks]
        self.proposer = BlockedProposer(
            blocks_indices, oversampling_factors=self.oversampling_factors,
            i_last_slow_block=(self.i_last_slow_block if self.drag else None),
            proposal_scale=self.proposal_scale)
        # Cycle length, taking into account oversampling/dragging
        if self.drag:
            self.cycle_length = self.n_slow
        else:
            self.cycle_length = sum(len(b) * o for b, o in
                                    zip(blocks_indices, self.oversampling_factors))
        self.log.debug(
            "Cycle length in steps: %r", self.cycle_length)
        for number in self._quants_d_units:
            number.set_scale(self.cycle_length // self.current_point.output_thin)

    def set_proposer_covmat(self, load=False):
        """
        Passes the covariance matrix to the proposer; if ``load`` is true, it is
        first (re)built via ``_load_covmat``, and incomplete covmats lower the
        R-1 threshold below which the proposal starts being learnt.
        """
        if load:
            # Build the initial covariance matrix of the proposal, or load from checkpoint
            self._covmat, where_nan = self._load_covmat(
                prefer_load_old=self.output.is_resuming())
            if np.any(where_nan) and self.learn_proposal:
                # We want to start learning the covmat earlier.
                self.mpi_info("Covariance matrix " +
                              ("not present" if np.all(where_nan) else "not complete") +
                              ". We will start learning the covariance of the proposal "
                              "earlier: R-1 = %g (would be %g if all params loaded).",
                              self.learn_proposal_Rminus1_max_early,
                              self.learn_proposal_Rminus1_max)
                self.learn_proposal_Rminus1_max = self.learn_proposal_Rminus1_max_early
        self.log.debug(
            "Sampling with covmat:\n%s",
            DataFrame(self._covmat,
                      columns=self.model.parameterization.sampled_params(),
                      index=self.model.parameterization.sampled_params()).to_string(
                line_width=_line_width))
        self.proposer.set_covariance(self._covmat)

    def _get_last_nondragging_block(self, blocks, speeds):
        """Returns the split index ``i`` minimising
        ``log(min(speeds[:i + 1])) - log(min(speeds[i + 1:]))``."""
        # blocks and speeds are already sorted
        log_differences = np.zeros(len(blocks) - 1)
        for i in range(len(blocks) - 1):
            log_differences[i] = (np.log(np.min(speeds[:i + 1])) -
                                  np.log(np.min(speeds[i + 1:])))
        i_max = np.argmin(log_differences)
        return i_max

    def _run(self):
        """
        Runs the sampler.
        """
        self.mpi_info(
            "Sampling!"
            + (" (NB: no accepted step will be saved until %d burn-in samples " %
               self.burn_in.value + "have been obtained)"
               if self.burn_in.value else ""))
        self.n_steps_raw = 0
        last_output = 0
        last_n = self.n()
        while last_n < self.max_samples and not self.converged:
            self.get_new_sample()
            self.n_steps_raw += 1
            if self.output_every.unit:
                # if output_every in sec, print some info and dump at fixed time intervals
                now = datetime.datetime.now()
                now_sec = now.timestamp()
                if now_sec >= last_output + self.output_every.value:
                    self.do_output(now)
                    last_output = now_sec
            if self.current_point.weight == 1:
                # have added new point
                # Callback function
                n = self.n()
                if n != last_n:
                    # and actually added
                    last_n = n
                    if (hasattr(self, "callback_function_callable") and
                            not (max(n, 1) % self.callback_every.value) and
                            self.current_point.weight == 1):
                        self.callback_function_callable(self)
                        self.last_point_callback = len(self.collection)
                    # Checking convergence and (optionally) learning
                    # the covmat of the proposal
                    if self.check_all_ready():
                        self.check_convergence_and_learn_proposal()
                        if is_main_process():
                            self.i_learn += 1
        if last_n == self.max_samples:
            self.log.info("Reached maximum number of accepted steps allowed. "
                          "Stopping.")
        # Make sure the last batch of samples ( < output_every (not in sec)) are written
        self.collection.out_update()
        if more_than_one_process():
            Ns = np.array(get_mpi_comm().gather(self.n()))
            if not is_main_process():
                Ns = []
        else:
            Ns = [self.n()]
        self.mpi_info("Sampling complete after %d accepted steps.", sum(Ns))

    def n(self, burn_in=False):
        """
        Returns the total number of accepted steps taken, including or not burn-in steps
        depending on the value of the `burn_in` keyword.
        """
        return len(self.collection) + (
            0 if not burn_in
            else (self.burn_in.value
                  - self.burn_in_left // self.current_point.output_thin + 1))

    def get_new_sample_metropolis(self):
        """
        Draws a new trial point from the proposal pdf and checks whether it is accepted:
        if it is accepted, it saves the old one into the collection and sets the new one
        as the current state; if it is rejected increases the weight of the current state
        by 1.

        Returns:
           ``True`` for an accepted step, ``False`` for a rejected one.
        """
        trial = self.current_point.values.copy()
        self.proposer.get_proposal(trial)
        try:
            logpost_trial, logprior_trial, loglikes_trial, derived = \
                self.model.logposterior(trial)
        except BaseException:
            # Catch-all on purpose: whatever goes wrong is re-raised right after
            # the other MPI processes have been notified.
            self.send_error_signal()
            raise
        accept = self.metropolis_accept(logpost_trial, self.current_point.logpost)
        self.process_accept_or_reject(accept, trial, derived,
                                      logpost_trial, logprior_trial, loglikes_trial)
        return accept

    def get_new_sample_dragging(self):
        """
        Draws a new trial point in the slow subspace, and gets the corresponding trial
        in the fast subspace by "dragging" the fast parameters.
        Finally, checks the acceptance of the total step using the "dragging" pdf:
        if it is accepted, it saves the old one into the collection and sets the new one
        as the current state; if it is rejected increases the weight of the current state
        by 1.

        Returns:
           ``True`` for an accepted step, ``False`` for a rejected one.
        """
        # Prepare starting and ending points *in the SLOW subspace*
        # "start_" and "end_" mean here the extremes in the SLOW subspace
        start_slow_point = self.current_point.values.copy()
        start_slow_logpost = self.current_point.logpost
        end_slow_point = start_slow_point.copy()
        self.proposer.get_proposal_slow(end_slow_point)
        self.log.debug("Proposed slow end-point: %r", end_slow_point)
        # Save derived parameters of delta_slow jump, in case I reject all the dragging
        # steps but accept the move in the slow direction only
        end_slow_logpost, end_slow_logprior, end_slow_loglikes, derived = (
            self.model.logposterior(end_slow_point))
        if end_slow_logpost == -np.inf:
            # Immediate rejection: the weight is bumped here directly, without
            # going through process_accept_or_reject().
            self.current_point.weight += 1
            return False
        # trackers of the dragging
        current_start_point = start_slow_point
        current_end_point = end_slow_point
        current_start_logpost = start_slow_logpost
        current_end_logpost = end_slow_logpost
        current_end_logprior = end_slow_logprior
        current_end_loglikes = end_slow_loglikes
        # accumulators for the "dragging" probabilities to be metropolis-tested
        # at the end of the interpolation
        start_drag_logpost_acc = start_slow_logpost
        end_drag_logpost_acc = end_slow_logpost
        # start dragging
        for i_step in range(1, 1 + self.drag_interp_steps):
            self.log.debug("Dragging step: %d", i_step)
            # take a step in the fast direction in both slow extremes
            delta_fast = np.zeros(len(current_start_point))
            self.proposer.get_proposal_fast(delta_fast)
            self.log.debug("Proposed fast step delta: %r", delta_fast)
            proposal_start_point = current_start_point + delta_fast
            proposal_end_point = current_end_point + delta_fast
            # get the new extremes for the interpolated probability
            # (reject if any of them = -inf; avoid evaluating both if just one fails)
            # Force the computation of the (slow blocks) derived params at the starting
            # point, but discard them, since they contain the starting point's fast ones,
            # not used later -- save the end point's ones.
            proposal_start_logpost = self.model.logposterior(proposal_start_point)[0]
            (proposal_end_logpost, proposal_end_logprior,
             proposal_end_loglikes, derived_proposal_end) = (
                self.model.logposterior(proposal_end_point)
                if proposal_start_logpost > -np.inf
                else (-np.inf, None, [], []))
            if proposal_start_logpost > -np.inf and proposal_end_logpost > -np.inf:
                # create the interpolated probability and do a Metropolis test
                frac = i_step / (1 + self.drag_interp_steps)
                proposal_interp_logpost = ((1 - frac) * proposal_start_logpost
                                           + frac * proposal_end_logpost)
                current_interp_logpost = ((1 - frac) * current_start_logpost
                                          + frac * current_end_logpost)
                accept_drag = self.metropolis_accept(proposal_interp_logpost,
                                                     current_interp_logpost)
            else:
                accept_drag = False
            self.log.debug("Dragging step: %s",
                           ("accepted" if accept_drag else "rejected"))
            # If the dragging step was accepted, do the drag
            if accept_drag:
                current_start_point = proposal_start_point
                current_start_logpost = proposal_start_logpost
                current_end_point = proposal_end_point
                current_end_logpost = proposal_end_logpost
                current_end_logprior = proposal_end_logprior
                current_end_loglikes = proposal_end_loglikes
                derived = derived_proposal_end
            # In any case, update the dragging probability for the final metropolis test
            start_drag_logpost_acc += current_start_logpost
            end_drag_logpost_acc += current_end_logpost
        # Test for the TOTAL step
        accept = self.metropolis_accept(end_drag_logpost_acc / self.drag_interp_steps,
                                        start_drag_logpost_acc / self.drag_interp_steps)
        self.process_accept_or_reject(
            accept, current_end_point, derived,
            current_end_logpost, current_end_logprior, current_end_loglikes)
        self.log.debug("TOTAL step: %s", ("accepted" if accept else "rejected"))
        return accept

    def metropolis_accept(self, logp_trial, logp_current):
        """
        Symmetric-proposal Metropolis-Hastings test.

        Returns:
           ``True`` or ``False``.
        """
        if logp_trial == -np.inf:
            return False
        elif logp_trial > logp_current:
            return True
        else:
            return np.random.exponential() > (logp_current - logp_trial)

    def process_accept_or_reject(self, accept_state, trial=None, derived=None,
                                 logpost_trial=None, logprior_trial=None,
                                 loglikes_trial=None):
        """
        Processes the acceptance/rejection of the new point.

        Raises:
           ``LoggedError`` if the current point has been rejected more than the
           allowed number of times (chain stuck).
        """
        if accept_state:
            # add the old point to the collection (if not burning or initial point)
            if self.burn_in_left <= 0:
                if self.current_point.add_to_collection(self.collection):
                    self.log.debug("New sample, #%d: \n %s",
                                   self.n(), self.current_point)
                    # Update chain files, if output_every *not* in sec
                    if not self.output_every.unit:
                        if self.n() % self.output_every.value == 0:
                            self.collection.out_update()
            else:
                self.burn_in_left -= 1
                self.log.debug("Burn-in sample:\n %s", self.current_point)
                if self.burn_in_left == 0 and self.burn_in:
                    self.log.info("Finished burn-in phase: discarded %d accepted steps.",
                                  self.burn_in.value)
            # set the new point as the current one, with weight one
            self.current_point.add(trial, derived=derived, logpost=logpost_trial,
                                   logpriors=logprior_trial, loglikes=loglikes_trial)
        else:  # not accepted
            self.current_point.weight += 1
            # Failure criterion: chain stuck! (but be more permissive during burn_in)
            max_tries_now = self.max_tries.value * \
                (1 + (10 - 1) * np.sign(self.burn_in_left))
            if self.current_point.weight > max_tries_now:
                self.collection.out_update()
                self.send_error_signal()
                raise LoggedError(
                    self.log,
                    "The chain has been stuck for %d attempts. Stopping sampling. "
                    "If this has happened often, try improving your "
                    "reference point/distribution. Alternatively (though not advisable) "
                    "make 'max_tries: np.inf' (or 'max_tries: .inf' in yaml).\n"
                    "Current point: %s", max_tries_now, self.current_point)

    # Functions to check convergence and learn the covariance of the proposal distribution

    def check_all_ready(self):
        """
        Checks if the chain(s) is(/are) ready to check convergence and, if requested,
        learn a new covariance matrix for the proposal distribution.

        Returns:
           ``True`` if ready (for MPI runs: once all processes are ready),
           ``False`` otherwise.
        """
        msg_ready = ("Ready to check convergence" +
                     (" and learn a new proposal covmat"
                      if self.learn_proposal else ""))
        n = len(self.collection)
        # If *just* (weight==1) got ready to check+learn
        if not (n % self.learn_every.value) and n > 0:
            self.log.info("Learn + convergence test @ %d samples accepted.", n)
            if more_than_one_process():
                self.been_waiting += 1
                if self.been_waiting > self.max_waiting:
                    self.send_error_signal()
                    raise LoggedError(
                        self.log, "Waiting for too long for all chains to be ready. "
                                  "Maybe one of them is stuck or died unexpectedly?")
            self.model.dump_timing()
            # If not MPI size > 1, we are ready
            if not more_than_one_process():
                self.log.debug(msg_ready)
                return True
            # Error check in case any process already sent an error signal
            self.check_error_signal()
            # If MPI, tell the rest that we are ready -- we use a "gather"
            # ("reduce" was problematic), but we are in practice just pinging
            if not hasattr(self, "req"):  # just once!
                self.all_ready = np.empty(get_mpi_size())
                self.req = get_mpi_comm().Iallgather(
                    np.array([1.]), self.all_ready)
                self.log.info(msg_ready + " (waiting for the rest...)")
        # If all processes are ready to learn (= communication finished)
        if hasattr(self, "req") and self.req.Test():
            # Sanity check: actually all processes have finished
            # (the message is %-formatted here; it used to be an unformatted tuple)
            assert np.all(self.all_ready == 1), (
                "This should not happen! Notify the developers. (Got %r)"
                % (self.all_ready,))
            if more_than_one_process() and is_main_process():
                self.log.info("All chains are r" + msg_ready[1:])
            delattr(self, "req")
            self.been_waiting = 0
            # Another error check, in case the error occurred after sending "ready" signal
            self.check_error_signal()
            # Just in case, a barrier here
            sync_processes()
            return True
        return False

    def check_convergence_and_learn_proposal(self):
        """
        Checks the convergence of the sampling process, and, if requested,
        learns a new covariance matrix for the proposal distribution from the covariance
        of the last samples.
        """
        if more_than_one_process():
            # Compute and gather means, covs and CL intervals of last half of chains
            use_first = int(self.n() / 2)
            mean = self.collection.mean(first=use_first)
            cov = self.collection.cov(first=use_first)
            mcsamples = self.collection._sampled_to_getdist_mcsamples(first=use_first)
            try:
                bound = np.array([[
                    mcsamples.confidence(i, limfrac=self.Rminus1_cl_level / 2.,
                                         upper=which)
                    for i in range(self.model.prior.d())] for which in [False, True]]).T
                success_bounds = True
            except Exception:
                # Best effort: without bounds only the means are tested
                bound = None
                success_bounds = False
            Ns, means, covs, bounds, acceptance_rates = map(
                lambda x: np.array(get_mpi_comm().gather(x)),
                [self.n(), mean, cov, bound, self.acceptance_rate])
        else:
            # Compute and gather means, covs and CL intervals of last m-1 chain fractions
            m = 1 + self.Rminus1_single_split
            cut = int(len(self.collection) / m)
            try:
                Ns = (m - 1) * [cut]
                means = np.array(
                    [self.collection.mean(first=i * cut, last=(i + 1) * cut - 1) for i in
                     range(1, m)])
                covs = np.array(
                    [self.collection.cov(first=i * cut, last=(i + 1) * cut - 1) for i in
                     range(1, m)])
                mcsamples_list = [
                    self.collection._sampled_to_getdist_mcsamples(
                        first=i * cut, last=(i + 1) * cut - 1)
                    for i in range(1, m)]
            except Exception:
                self.log.info("Not enough points in chain to check convergence. "
                              "Waiting for next checkpoint.")
                return
            acceptance_rates = self.acceptance_rate
            try:
                bounds = [np.array(
                    [[mcs.confidence(i, limfrac=self.Rminus1_cl_level / 2., upper=which)
                      for i in range(self.model.prior.d())] for which in [False, True]]).T
                          for mcs in mcsamples_list]
                success_bounds = True
            except Exception:
                bounds = None
                success_bounds = False
        # Compute convergence diagnostics
        if is_main_process():
            self.progress.at[self.i_learn, "N"] = (
                sum(Ns) if more_than_one_process() else self.n())
            self.progress.at[self.i_learn, "timestamp"] = \
                datetime.datetime.now().isoformat()
            acceptance_rate = (
                np.average(acceptance_rates, weights=Ns)
                if more_than_one_process() else acceptance_rates)
            self.log.info(" - Acceptance rate: %.3f" +
                          (" = avg(%r)" % list(acceptance_rates)
                           if more_than_one_process() else ""),
                          acceptance_rate)
            self.progress.at[self.i_learn, "acceptance_rate"] = acceptance_rate
            # "Within" or "W" term -- our "units" for assessing convergence
            # and our prospective new covariance matrix
            mean_of_covs = np.average(covs, weights=Ns, axis=0)
            # "Between" or "B" term
            # We don't weight with the number of samples in the chains here:
            # shorter chains will likely be outliers, and we want to notice them
            cov_of_means = np.atleast_2d(np.cov(means.T))  # , fweights=Ns)
            # For numerical stability, we turn mean_of_covs into correlation matrix:
            #   rho = (diag(Sigma))^(-1/2) * Sigma * (diag(Sigma))^(-1/2)
            # and apply the same transformation to the mean of covs (same eigenvals!)
            # NB: disables warnings from numpy
            prev_err_state = deepcopy(np.geterr())
            np.seterr(divide="ignore")
            diagSinvsqrt = np.diag(np.power(np.diag(cov_of_means), -0.5))
            np.seterr(**prev_err_state)
            corr_of_means = diagSinvsqrt.dot(cov_of_means).dot(diagSinvsqrt)
            norm_mean_of_covs = diagSinvsqrt.dot(mean_of_covs).dot(diagSinvsqrt)
            success = False
            # Cholesky of (normalized) mean of covs and eigvals of Linv*cov_of_means*L
            try:
                L = np.linalg.cholesky(norm_mean_of_covs)
            except np.linalg.LinAlgError:
                self.log.warning(
                    "Negative covariance eigenvectors. "
                    "This may mean that the covariance of the samples does not "
                    "contain enough information at this point. "
                    "Skipping learning a new covmat for now.")
            else:
                Linv = np.linalg.inv(L)
                # Suppress numpy warnings (restored later in this function)
                error_handling = deepcopy(np.geterr())
                np.seterr(all="ignore")
                try:
                    eigvals = np.linalg.eigvalsh(Linv.dot(corr_of_means).dot(Linv.T))
                    success = True
                except np.linalg.LinAlgError:
                    self.log.warning("Could not compute eigenvalues. "
                                     "Skipping learning a new covmat for now.")
                else:
                    Rminus1 = max(np.abs(eigvals))
                    self.progress.at[self.i_learn, "Rminus1"] = Rminus1
                    # For real square matrices, a possible def of the cond number is:
                    condition_number = Rminus1 / min(np.abs(eigvals))
                    self.log.debug(" - Condition number = %g", condition_number)
                    self.log.debug(" - Eigenvalues = %r", eigvals)
                    self.log.info(
                        " - Convergence of means: R-1 = %f after %d accepted steps" % (
                            Rminus1, (sum(Ns) if more_than_one_process() else self.n())) +
                        (" = sum(%r)" % list(Ns) if more_than_one_process() else ""))
                    # Have we converged in means?
                    # (criterion must be fulfilled twice in a row)
                    if max(Rminus1, self.Rminus1_last) < self.Rminus1_stop:
                        # Check the convergence of the bounds of the confidence intervals
                        # Same as R-1, but with the rms deviation from the mean bound
                        # in units of the mean standard deviation of the chains
                        if success_bounds:
                            Rminus1_cl = (np.std(bounds, axis=0).T /
                                          np.sqrt(np.diag(mean_of_covs)))
                            self.log.debug(" - normalized std's of bounds = %r",
                                           Rminus1_cl)
                            Rminus1_cl = np.max(Rminus1_cl)
                            self.progress.at[self.i_learn, "Rminus1_cl"] = Rminus1_cl
                            self.log.info(
                                " - Convergence of bounds: R-1 = %f after %d " % (
                                    Rminus1_cl,
                                    (sum(Ns) if more_than_one_process() else self.n())) +
                                "accepted steps" +
                                (" = sum(%r)" % list(
                                    Ns) if more_than_one_process() else ""))
                            if Rminus1_cl < self.Rminus1_cl_stop:
                                self.converged = True
                                self.log.info("The run has converged!")
                                self._Ns = Ns
                        else:
                            self.log.info("Computation of the bounds was not possible. "
                                          "Waiting until the next converge check.")
                np.seterr(**error_handling)
        else:
            # Non-main processes: placeholder buffer to receive the broadcast covmat
            mean_of_covs = np.empty((self.model.prior.d(), self.model.prior.d()))
            success = None
            Rminus1 = None
        # Broadcast and save the convergence status and the last R-1 of means
        success = share_mpi(success)
        if success:
            self.Rminus1_last, self.converged = share_mpi(
                (Rminus1, self.converged) if is_main_process() else None)
            # Do we want to learn a better proposal pdf?
            if self.learn_proposal and not self.converged:
                good_Rminus1 = (self.learn_proposal_Rminus1_max >
                                self.Rminus1_last > self.learn_proposal_Rminus1_min)
                if not good_Rminus1:
                    self.mpi_info("Convergence less than requested for updates: "
                                  "waiting until the next convergence check.")
                    # NB: this early return also skips the checkpoint dump below
                    return
                if more_than_one_process():
                    get_mpi_comm().Bcast(mean_of_covs, root=0)
                else:
                    mean_of_covs = covs[0]
                try:
                    self.proposer.set_covariance(mean_of_covs)
                    if is_main_process():
                        self.log.info(" - Updated covariance matrix of proposal pdf.")
                        self.log.debug("%r", mean_of_covs)
                except Exception:
                    # Best effort: keep sampling with the previous covmat
                    if is_main_process():
                        self.log.debug("Updating covariance matrix failed unexpectedly. "
                                       "waiting until next covmat learning attempt.")
        # Save checkpoint info
        self.write_checkpoint()

    def send_error_signal(self):
        """
        Sends an error signal to the other MPI processes.
        """
        for i_rank in range(get_mpi_size()):
            if i_rank != get_mpi_rank():
                get_mpi_comm().isend(True, dest=i_rank, tag=_error_tag)

    def check_error_signal(self):
        """
        Checks if any of the other process has sent an error signal, and fails.

        NB: This behaviour only shows up when running this sampler inside a Python script,
            not when running with `cobaya run` (in that case, the process raising an error
            will call `MPI_ABORT` and kill the rest).
        """
        for i in range(get_mpi_size()):
            if i != get_mpi_rank():
                from mpi4py import MPI
                status = MPI.Status()
                get_mpi_comm().iprobe(i, status=status)
                if status.tag == _error_tag:
                    raise LoggedError(self.log, "Another process failed! Exiting.")

    def do_output(self, date_time):
        """Flushes the collection to disk and logs a progress line stamped
        with ``date_time``."""
        self.collection.out_update()
        msg = "Progress @ %s : " % date_time.strftime("%Y-%m-%d %H:%M:%S")
        msg += "%d steps taken" % self.n_steps_raw
        if self.burn_in_left and self.burn_in:  # NB: burn_in_left = 1 even if no burn_in
            msg += " -- still burning in, %d accepted steps left." % self.burn_in_left
        else:
            msg += ", and %d accepted." % self.n()
        self.log.info(msg)

    def write_checkpoint(self):
        """On the main process, if there is output: dumps the proposal covmat,
        the checkpoint file and the last row of the progress table."""
        if is_main_process() and self.output:
            checkpoint_filename = self.checkpoint_filename()
            self.dump_covmat(self.proposer.get_covariance())
            checkpoint_info = {kinds.sampler: {self.get_name(): {
                "converged": bool(self.converged),
                "Rminus1_last": self.Rminus1_last,
                "burn_in": (self.burn_in.value  # initial: repeat burn-in if not finished
                            if not self.n() and self.burn_in_left else
                            0),  # to avoid overweighting last point of prev. run
                "mpi_size": get_mpi_size()}}}
            yaml_dump_file(checkpoint_filename, checkpoint_info, error_if_exists=False)
            if not self.progress.empty:
                with open(self.progress_filename(), "a",
                          encoding="utf-8") as progress_file:
                    progress_file.write(
                        self.progress.tail(1).to_string(header=False, index=False) + "\n")
            self.log.debug("Dumped checkpoint and progress info, and current covmat.")

    # Finally: returning the computed products ###########################################

    def products(self):
        """
        Auxiliary function to define what should be returned in a scripted call.

        Returns:
           A dictionary with the sample ``Collection`` containing the accepted
           steps under ``"sample"`` and, in the main process only, the progress
           table under ``"progress"``.
        """
        products = {"sample": self.collection}
        if is_main_process():
            products["progress"] = self.progress
        return products

    # Class methods
    @classmethod
    def output_files_regexps(cls, output, info=None, minimal=False):
        """Returns ``(regexp, None)`` pairs matching the output files: only the
        collections if ``minimal``, else also checkpoint, progress and covmat."""
        regexps = [output.collection_regexp(name=None)]
        if minimal:
            return [(r, None) for r in regexps]
        regexps += [
            re.compile(output.prefix_regexp_str + re.escape(ext.lstrip(".")) + "$")
            for ext in [_checkpoint_extension, _progress_extension, _covmat_extension]]
        return [(r, None) for r in regexps]

    @classmethod
    def get_version(cls):
        """Returns the module-level ``__version__``."""
        return __version__

    @classmethod
    def _get_desc(cls, info=None):
        """Returns a one-sentence description of the sampler; mentions dragging
        according to ``info["drag"]`` (or conditionally, if no ``info`` given)."""
        if info is None:
            drag = None
        else:
            drag = info.get("drag", cls.get_defaults()["drag"])
        drag_string = {
            True: r" using the fast-dragging procedure described in \cite{Neal:2005}",
            False: ""}
        # Unknown case (no info passed)
        drag_string[None] = " [(if drag: True)%s]" % drag_string[True]
        return ("Adaptive, speed-hierarchy-aware MCMC sampler (adapted from CosmoMC) "
                r"\cite{Lewis:2002ah,Lewis:2013hha}" + drag_string[drag] + ".")
def t_d_df(some_df: DataFrame) -> DataFrame:
    """Return the final row of ``some_df`` as a one-row DataFrame.

    An empty frame yields an empty frame.  The ``assert`` rejects anything
    that is not a DataFrame; note that it is skipped when Python runs
    with ``-O``.
    """
    assert isinstance(some_df, DataFrame)
    last_row = some_df.tail(1)
    return last_row
#coding:utf-8
from pandas import DataFrame
import pandas as pd

# One row per calendar day: 252 consecutive days starting on 2016-08-03.
timeAll = DataFrame(pd.date_range('8/3/2016', periods=252, freq='1d'))
print(timeAll.head())
timeAll.columns = ['day']
print(timeAll.tail())

# Empty frame that only fixes the column layout.
basedata = DataFrame(columns=['day', 'sale', 'price', 'skuid'])
print(basedata)

# Randomly split the data into a training set and a test set.
# ``sklearn.cross_validation`` no longer exists in current scikit-learn
# releases; ``train_test_split`` lives in ``sklearn.model_selection``.  The
# fallback keeps very old installations working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# NOTE(review): X_trans and y are not defined in the visible part of this
# script; they are presumably created elsewhere -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans, y, random_state=1, test_size=0.4, train_size=0.6)
def strategy_sma_crossover(self, short_term_sma_dataframe: pd.DataFrame,
                           long_term_sma_dataframe: pd.DataFrame,
                           short_term_moving_average: int,
                           long_term_moving_average: int,
                           sma_moving_average_lookback_time_period: int):
    """Sort tickers by whether their short SMA recently crossed the long SMA.

    Only the last ``sma_moving_average_lookback_time_period`` rows of each
    input table are examined.  For every ticker in
    ``self.sp_500_symbols_list`` the columns ``<ticker>_<short>_sma`` and
    ``<ticker>_<long>_sma`` are compared row by row against the previous row.

    Returns:
        A tuple ``(cross_up_tickers, cross_down_tickers)``.  A ticker with at
        least one cross-up goes into the first list only; otherwise, if it
        has at least one cross-down, it goes into the second list.
    """
    print('Analyzing Simple Moving Average Strategy...')

    lookback = sma_moving_average_lookback_time_period
    # Join the most recent rows of both tables on their index.
    recent_smas = pd.merge(short_term_sma_dataframe.tail(lookback),
                           long_term_sma_dataframe.tail(lookback),
                           left_index=True, right_index=True)

    crossed_up = []
    crossed_down = []
    for ticker in self.sp_500_symbols_list:
        ticker_smas = recent_smas.filter(regex=rf"^{ticker}_")
        short_column = f'{ticker}_{short_term_moving_average}_sma'
        long_column = f'{ticker}_{long_term_moving_average}_sma'

        # shift(1) moves every value down one row, so each row sees the
        # previous row's value; the first row becomes NaN and can never
        # register a crossing.
        short_before = ticker_smas[short_column].shift(1)
        long_before = ticker_smas[long_column].shift(1)
        short_now = ticker_smas[short_column]
        long_now = ticker_smas[long_column]

        # Cross up: short is above long now but was at or below it before.
        went_up = (short_now > long_now) & (short_before <= long_before)
        # Cross down: the mirrored comparison.
        went_down = (short_now < long_now) & (short_before >= long_before)

        if np.count_nonzero(went_up) > 0:
            crossed_up.append(ticker)
        elif np.count_nonzero(went_down) > 0:
            crossed_down.append(ticker)

    print('Analyzing Simple Moving Average Strategy...DONE')
    return crossed_up, crossed_down
index=list('abcd'), columns=['one', 'two']) print(df) print(df.sum()) print(df.sum(axis=1)) # axis=1将按行计算,axis=0将按列计算 print(df.mean(axis=1, skipna=False)) print(df.describe()) obj = Series(['a', 'a', 'b', 'c'] * 4) print(obj) print(obj.describe()) all_data = {} for ticker in ['AAP', 'IBM', 'MSFT', 'GOOG']: all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010') price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()}) volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()}) returns = price.pct_change() print(price.tail()) data = DataFrame({ 'Qu1': [1, 3, 4, 3, 4], 'Qu2': [2, 3, 1, 2, 3], 'Qu3': [1, 5, 2, 4, 4] }) print(data)
iris_target = iris.target # 格式整理 iris_target.shape = (150, 1) iris_all = np.hstack((iris_data, iris_target)) # 转化为DataFrame iris_data_df = DataFrame(iris_data, columns=feature_names) iris_target_df = DataFrame(iris_target, columns=['target']) iris_data_all_df = DataFrame(iris_all, columns=feature_names + ['target']) ''' 数据集基础信息的获取[以iris_data_df为例] ''' # 数据预览 print(iris_data_all_df.head()) # 默认为前5行 print(iris_data_all_df.tail()) # 默认为后5行 print(iris_data_all_df.sample(5)) # 随机抽取5行 # 数据描述 ''' 这里是处理好的数据集,所以数据格式比较完整,不用进一步的处理。 如有数据乱码或者出现缺失值等情况,我们当按照上一篇的方法进行适当的数据清洗。 ''' # print(iris_data_all_df.isnull().sum()) # 缺失值 print(iris_data_all_df.shape) # 大小 print(iris_data_all_df.dtypes) # 类型 print(iris_data_all_df.describe()) # 常见统计量的描述 print(iris_data_all_df.info()) # 多种信息 ''' 可视化的方法,来直观了解数据
prop_c_male('Leslie').plot() # <codecell> # I couldn't figure out a way of iterating over the names rather than names/sex combo in # a vectorized way. from itertools import islice names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]),None)) m = [(name_, ambi_names_pt[name_]['M']/(ambi_names_pt[name_]['F'] + ambi_names_pt[name_]['M'])) \ for name_ in names_to_calc] p_m_instant = DataFrame(dict(m)) p_m_instant.tail() # <codecell> # similar calculation except instead of looking at the proportions for a given year only, # we look at the cumulative number of male/female babies for given name from itertools import islice names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]),None)) m = [(name_, ambi_names_pt[name_]['M'].cumsum()/(ambi_names_pt[name_]['F'].cumsum() + ambi_names_pt[name_]['M'].cumsum())) \ for name_ in names_to_calc] p_m_cum = DataFrame(dict(m)) p_m_cum.tail()
def label(self, data: pd.DataFrame):
    """Run the configured strategy on ``data`` and print its last 50 rows.

    ``data.ta.strategy`` is called with ``self.tastrat`` and ``append=True``.
    NOTE(review): ``.ta`` is presumably the pandas-ta accessor, in which case
    the indicator columns are added to ``data`` in place -- confirm.
    """
    # Single indicators that were tried before the strategy object was used:
    #   data.ta.sma(length=20, append=True)
    #   data.ta.macd(append=True)
    #   data.ta.percent_return(cumulative=True, append=True)
    data.ta.strategy(self.tastrat, append=True)
    recent_rows = data.tail(50)
    print(recent_rows)
def GetFollowsByCode_InFiles(filelist, code='SH600036'):
    """Plot follower changes, price and volume for one stock code.

    The code is normalised with ``CodeName_process``; the stock name and its
    follower records are collected from ``filelist`` by
    ``GetFollows_InFiles``; the rows produced by ``GetFollows_ProcessList``
    are loaded into a DataFrame with the columns DATE, CHG, CHG_PCT, PRICE
    and VOLUME.  The last 20 rows and the mean of CHG are printed, then a
    two-panel figure is drawn: CHG bars with PRICE on a secondary y-axis in
    the upper panel, VOLUME bars in the lower panel.  The figure is shown
    when ``savepng`` is false and written to ``save_fname`` otherwise.

    Args:
        filelist: passed through unchanged to ``GetFollows_InFiles``.
        code: stock code; replaced by ``CodeName_process(code)`` before use.

    Returns:
        None.  Returns early, after printing a message, when
        ``get_stock_history_csv`` yields an empty string.

    NOTE(review): ``savepng`` and ``save_fname`` are not assigned in this
    function, so they are presumably module-level names -- confirm.
    """
    global codemarket
    # print filelist
    code = CodeName_process(code)
    print 'code:', code
    name, follows_list = GetFollows_InFiles(filelist, code)
    # ``name`` is a GBK-encoded byte string; decode it before printing/using.
    print name.decode('gbk')
    csvfilename = get_stock_history_csv(code, name.decode('gbk'))
    print csvfilename
    if csvfilename == '':
        print 'csv file not found. exit.'
        return
    # print 'follows_list:', follows_list
    follows_chg_list = GetFollows_ProcessList(follows_list, csvfilename)
    # First field of every row becomes the index (Python 2: zip returns a list).
    xdata = zip(*follows_chg_list)[0]

    #get DataFrame from List
    df = DataFrame(follows_chg_list, index=xdata,
                   columns=['DATE', 'CHG', 'CHG_PCT', 'PRICE', 'VOLUME'])
    # print df
    print df.tail(20)
    # print len(df)
    # print df.CHG.describe()
    CHG_mean = df.CHG.mean()
    print 'CHG_mean', CHG_mean
    # print [CHG_mean for x in range(10)]
    # return

    #####
    # fig = plt.figure(figsize=(16,9))
    # fig, (ax0, ax1) = plt.subplots(nrows=2, figsize=(16,9))
    fig = plt.figure(figsize=(16, 8.5))
    # Upper panel: CHG bars plus a dashed horizontal line at the CHG mean.
    ax0 = fig.add_axes((0.1, 0.2, 0.8, 0.7))  #[left, bottom, width, height]

    # ax_left = ax0
    ax_left = df.CHG.plot(ax=ax0, kind='bar', alpha=0.5, align='center', linewidth=2)
    ax0.plot([CHG_mean for x in range(len(df))], 'g--', linewidth=2)
    ax_left.set_ylabel('f')
    # PRICE shares the upper panel but uses its own (right-hand) y-axis.
    ax_right = df.PRICE.plot(ax=ax0, secondary_y=True, color='red', marker='v', linewidth=2, alpha=0.7)
    ax_right.set_ylabel('price')
    if codemarket == 0:
        # NOTE(review): the handle returned by file(...) is never closed
        # explicitly.
        value_str = GetStockInfo_fromFile(
            csv.reader(file('stockinfo_cn.csv', 'rb')), code).decode('gbk')
        plt.title(name.decode('gbk') + code + ' v' + value_str)
    else:
        plt.title(name.decode('gbk') + code)
    plt.xlabel('Date')
    # print type(plt.xlim())
    # print type(xdata), xdata, xdata[0]
    # NOTE(review): the local name ``list`` shadows the builtin for the rest
    # of this function.
    list, listlabel = GetXticksList(xdata)
    ax_left.set_xticks(list)
    # Tick labels are left blank on the upper panel.
    ax_left.set_xticklabels([])  #(listlabel, fontsize='small')
    # plt.legend()
    # fig.autofmt_xdate()

    # ax1.set_title('volume')
    # plt.subplot(223, axisbg='r')
    # Lower panel: VOLUME bars, sharing the x-axis with the upper panel.
    ax1 = fig.add_axes((0.1, 0.05, 0.8, 0.15), sharex=ax0)
    ax_volume = df.VOLUME.plot(ax=ax1, kind='bar', color='green', linewidth=1, alpha=0.7)
    ax_volume.set_xticklabels([])
    ax_volume.set_xticklabels(listlabel, fontsize='small')
    ax_volume.set_xticks(list)
    ax_volume.set_ylabel('volume')
    # Dashed horizontal line at the mean volume.
    ax1.plot([df.VOLUME.mean() for x in range(len(df))], 'g--', linewidth=2)

    # fig.subplots_adjust(bottom=0.8)
    # cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    # fig.colorbar(im, cax=cbar_ax)

    if not savepng:
        plt.show()
    else:
        fig.savefig(save_fname)  #, dpi=140)
def GetFollowsByCode_InFiles(filelist, code = 'SH600036'):
    """Plot follower changes, price and volume for one stock code.

    The code is normalised with ``CodeName_process``; the stock name and its
    follower records are collected from ``filelist`` by
    ``GetFollows_InFiles``; the rows produced by ``GetFollows_ProcessList``
    from those records and the price history are loaded into a DataFrame
    with the columns DATE, CHG, CHG_PCT, PRICE and VOLUME.  The last 20 rows
    and the mean of CHG are printed, then a two-panel figure is drawn: CHG
    bars with PRICE on a secondary y-axis in the upper panel, VOLUME bars in
    the lower panel.  The title combines the last DATE, ``titleprefix``, the
    name, the code, the value from ``get_stockinfo_volume`` and the last
    CHG over the integer CHG mean.  The figure is shown when ``savepng`` is
    false and written to ``save_fname`` otherwise.

    Args:
        filelist: passed through unchanged to ``GetFollows_InFiles``.
        code: stock code; replaced by ``CodeName_process(code)`` before use.

    Returns:
        None.  Returns early, after printing a message, when
        ``get_stock_history_csv`` yields an empty list.

    NOTE(review): ``savepng`` and ``save_fname`` are not assigned in this
    function, so they are presumably module-level names -- confirm.
    """
    global codemarket
    global titleprefix
    # print filelist
    code = CodeName_process(code)
    print 'code:', code
    name, follows_list = GetFollows_InFiles(filelist, code)
    #print follows_list[-5:-1]
    print name  #.decode('gbk')
    pricehistory = get_stock_history_csv(code, name)
    if pricehistory == []:
        print 'Get pricehistory failed. Exit.'
        return
    # print 'follows_list:', follows_list
    follows_chg_list = GetFollows_ProcessList(follows_list, pricehistory)
    # First field of every row becomes the index (Python 2: zip returns a list).
    xdata = zip(*follows_chg_list)[0]

    #get DataFrame from List
    df = DataFrame(follows_chg_list, index=xdata, columns=['DATE', 'CHG', 'CHG_PCT', 'PRICE', 'VOLUME'])
    # print df
    print df.tail(20)
    # print len(df)
    # print df.CHG.describe()
    CHG_mean = df.CHG.mean()
    print 'CHG_mean', CHG_mean
    # print [CHG_mean for x in range(10)]
    # return

    #####
    # fig = plt.figure(figsize=(16,9))
    # fig, (ax0, ax1) = plt.subplots(nrows=2, figsize=(16,9))
    fig = plt.figure(figsize=(16,8.5))
    # Upper panel: CHG bars plus a dashed horizontal line at the CHG mean.
    ax0 = fig.add_axes((0.1, 0.2, 0.8, 0.7))  #[left, bottom, width, height]

    # ax_left = ax0
    ax_left = df.CHG.plot(ax=ax0, kind='bar', alpha=0.5, align='center', linewidth=2)
    ax0.plot([CHG_mean for x in range(len(df))], 'g--', linewidth=2)
    ax_left.set_ylabel('f')
    # PRICE shares the upper panel but uses its own (right-hand) y-axis.
    ax_right = df.PRICE.plot(ax=ax0, secondary_y=True, color='red', marker='v', linewidth=2, alpha=0.7)
    ax_right.set_ylabel('price')
    value_str = str(get_stockinfo_volume(code)[0])+u'亿'
    # "<last CHG>/<integer mean of CHG>"
    follow_str = str(df.CHG[-1])+'/'+ str(int(CHG_mean))
    plt.title(df.DATE[-1]+' '+titleprefix+' '+name+code+' v'+value_str+' F'+follow_str)
    plt.xlabel('Date')
    # print type(plt.xlim())
    # print type(xdata), xdata, xdata[0]
    # NOTE(review): the local name ``list`` shadows the builtin for the rest
    # of this function.
    list, listlabel = GetXticksList(xdata)
    ax_left.set_xticks(list)
    # Tick labels are left blank on the upper panel.
    ax_left.set_xticklabels([])  #(listlabel, fontsize='small')
    # plt.legend()
    # fig.autofmt_xdate()

    # ax1.set_title('volume')
    # plt.subplot(223, axisbg='r')
    # Lower panel: VOLUME bars, sharing the x-axis with the upper panel.
    ax1 = fig.add_axes((0.1, 0.05, 0.8, 0.15), sharex=ax0)
    ax_volume = df.VOLUME.plot(ax=ax1, kind='bar', color='green', linewidth=1, alpha=0.7)
    ax_volume.set_xticklabels([])
    ax_volume.set_xticklabels(listlabel, fontsize='small')
    ax_volume.set_xticks(list)
    ax_volume.set_ylabel('volume')
    # Dashed horizontal line at the mean volume.
    ax1.plot([df.VOLUME.mean() for x in range(len(df))], 'g--', linewidth=2)

    # fig.subplots_adjust(bottom=0.8)
    # cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    # fig.colorbar(im, cax=cbar_ax)

    if not savepng:
        plt.show()
    else:
        fig.savefig(save_fname)  #, dpi=140)
df_train_label = pd.read_csv( join(dir_path, 'labels_train.csv'), header=0, index_col=0) df_train_label = df_train_label.loc[:, df_train_label.columns.intersection([ ticker_name])] df_train_label.columns = ['Y'] df_complete = pd.merge(merged_df, df_train_label[['Y']], on='Date') df_complete = df_complete.sort_values('Date') super_merged_df = super_merged_df.append(df_complete, ignore_index=True) count = super_merged_df.shape[0] print(count) print(super_merged_df.tail()) # 5 Splitting Data in training and testing x = super_merged_df[['Entwicklungsrate Preis t+10', 'Entwicklungsrate Preis t+20', 'Entwicklungsrate Preis t+30', 'Entwicklungsrate Preis t+40', 'Entwicklungsrate Preis t+50', 'Entwicklungsrate Preis t+60', 'Entwicklungsrate Preis t+70', 'Entwicklungsrate Preis t+80', 'Entwicklungsrate Preis t+90', 'Entwicklungsrate Volume t+10', 'Entwicklungsrate Volume t+20', 'Entwicklungsrate Volume t+30', 'Entwicklungsrate Volume t+40',
# Collect one [text, ta, desc] row per element of ``ch``.
for i in ch:
    # print(i.tag)  # {current}local .....
    local_name = i.text  # we want the element's value, not an attribute, hence .text
    # print(local_name)
    ta = i.get('ta')
    desc = i.get('desc')
    datas += [[local_name, ta, desc]]  # kept as a list of lists so it can be fed to DataFrame
print(datas)

from pandas import DataFrame
df = DataFrame(datas, columns=['지역', '온도', '기상상태'])
print(df.head())  # print only the first few rows
print(df['지역'])  # print the first column for every row
print(df.tail(3))  # print only the last 3 rows

# =================== A shorter way: do the two steps above in one go ===================
print('\n\n웹자료 읽어 바로 출력===============================================================')
# import urllib.request
# NOTE(review): the response object is never closed explicitly.
webdata2 = urllib.request.urlopen('http://www.kma.go.kr/XML/weather/sfc_web_map.xml')
xmlFile = etree.parse(webdata2)
root = xmlFile.getroot()
ndate = list(root[0].attrib.values())  # read only the attribute values
print(ndate)
print(ndate[0] + '년 ' + ndate[1] + '월 ' + ndate[2] + '일 ' + ndate[3] + '시')
for child in root:
    for subChild in child:
        # NOTE(review): attrib.get('ta') returns None when the attribute is
        # missing, which would make this concatenation fail.
        print(subChild.text + ' : ' + subChild.attrib.get('ta'))
# zscore normalization for petal length # standard deviation taken from description of the data def zScoreNorm(num): return ((num - m) / 1.76) iris_data_c = iris_data iris_data_c['Petal Length'] = iris_data_c['Petal Length'].apply(zScoreNorm) norm_zscore_data = iris_data_c['Petal Length'] print "norm_data" print norm_zscore_data print iris_data print iris_target iris_target['Species'] = iris_target['Species'].apply(flower_type) print iris_target.head() print iris_target.tail() iris = pd.concat([iris_data, iris_target], axis=1) print iris sns.pairplot(iris, hue='Species', size=2) sns.plt.show() sns.factorplot('Petal Length', data=iris, hue='Species', size=8, kind='count') sns.plt.show()
def zScoreNorm(num):
    """Return ``(num - m) / 1.76``; ``m`` is read from module scope."""
    return ((num - m)/1.76)


# NOTE(review): plain assignment does not copy, so ``iris_data_c`` and
# ``iris_data`` are the same object; the column assignment below therefore
# also changes ``iris_data``.
iris_data_c = iris_data
iris_data_c['Petal Length'] = iris_data_c['Petal Length'].apply(zScoreNorm)
norm_zscore_data = iris_data_c['Petal Length']
print "norm_data"
print norm_zscore_data

print iris_data
print iris_target

# Replace every 'Species' value with the result of flower_type().
iris_target['Species'] = iris_target['Species'].apply(flower_type)

print iris_target.head()
print iris_target.tail()

# Put features and species side by side (column-wise concatenation).
iris = pd.concat([iris_data,iris_target],axis=1)
print iris

sns.pairplot(iris,hue='Species',size=2)
sns.plt.show()

sns.factorplot('Petal Length',data=iris,hue='Species',size=8,kind='count')
sns.plt.show()
# Reference data by column name or method of dataframe variable print(aonao['NAO']) print(aonao.NAO) # Add column to dataframe aonao['Diff'] = aonao['AO'] - aonao['NAO'] # Show first several lines of new dataframe print(aonao.head()) # Remove column from dataframe del aonao['Diff'] # Show last few lines of dataframe print(aonao.tail()) # Show slice from dataframe print(aonao['1981-01':'1981-03']) # Complex indexing example import datetime aonao.loc[(aonao.AO > 0) & (aonao.NAO < 0) & (aonao.index > datetime.datetime(1980, 1, 1)) & (aonao.index < datetime.datetime(1989, 1, 1)), 'NAO'].plot(kind='barh') # # # NEXT SECTION OF TUTORIAL (STATISTICS)
# Scratch list of Series / DataFrame / numpy calls.
# NOTE(review): several calls below omit arguments that the library requires
# (data.drop(), np.argsort(), data.fillna()), so this snippet looks like a
# list of method names rather than something meant to run top to bottom --
# confirm before executing.
obj = Series([1,2,3])
obj.reindex()
data = DataFrame([[1,2,3],[4,5,6]])
data.drop()        # no labels given
np.argsort()       # no array given
obj.rank()
obj.sort_values()
data.tail()        # last rows (default count)
data.cov()
data.cov()
data.corr()
data.dropna()
data.loc           # bare attribute access, nothing is indexed
data.fillna()      # no fill value or method given
data.unstack()
'薪资': Series(data=salary_list,index=index_list), '婚姻状况': Series(data=marital_list,index=index_list) } df=DataFrame(dic) # 方法一:遍历薪水列 for value in df['薪资']: print(value) # 方法二:遍历薪水列 for index,row_data in df.iterrows(): print(row_data['薪资']) # 方法三:遍历薪水列 for col,col_data in df.iteritems(): if col == '薪资': print(col_data) # 获取最大薪资 for col,col_data in df.iteritems(): if col == '薪资': # 将薪资中的k去掉并转化成float类型 list1 = [float(value[:len(value)-1]) for value in col_data] # 排序 max_salary = sorted(list1,reverse=True)[0] print(max_salary) print('###################') df df.tail(2) df.head(2) df.values
def tail(df: DataFrame, *args, **kwargs) -> DataFrame:
    """Function-style spelling of ``df.tail`` for users coming from R.

    All positional and keyword arguments are forwarded to
    ``DataFrame.tail`` unchanged.
    """
    last_rows = df.tail(*args, **kwargs)
    return last_rows
#access indexes and columns print(df.columns) print(df['Industry']) #multiple columns print(DataFrame(df, columns=['Rank', 'Industry', 'Name'])) #NaN values df2 = DataFrame(df, columns=['Rank', 'Industry', 'Name', 'Profit']) print("New dataFrame=") print(df2) #head and tail print(df2.head(4)) #prints first 5 rows print(df2.tail(4)) #prints last 5 rows #access rows in dataframe #print(df.ix[0]) #does not work print(df.iloc[0]) #first row print(df.loc[5]) #5th row #assign values to dataframe using numpy a1 = np.array([1, 2, 3, 4, 5, 6, 7, 8]) df2['Profit'] = a1 print(df2) #using series profit = Series([900, 100], index=[3, 5]) df2['Profit'] = profit print(df2)
def _undifference_timeseries(self, historical_data: pd.DataFrame, forecasted_data: list): return np.cumsum( historical_data.tail(1).value.to_list() + forecasted_data).tolist()[1:]
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# 1000 x 5 frame of standard-normal samples.
df = DataFrame(np.random.randn(1000, 5))
print(df)

# Basic observations
print(df.head())
print(df.tail())
print(df.describe())

# Values of the first column whose magnitude exceeds 3
column = df[0]
print(column.head())
print(column[np.abs(column) > 3])

# any(axis=1) -> rows with at least one element whose magnitude exceeds 3.
# The axis is passed by keyword: recent pandas releases no longer accept it
# positionally.
print(df[(np.abs(df) > 3).any(axis=1)])

# Overwrite every such value with +5 / -5, keeping its sign.
df[(np.abs(df) > 3)] = np.sign(df) * 5
print(df.describe())
('A','b'): [0,1,2,3,-1], ('B','a'): [-20,-10,0,10,20], ('B','b'): [-200,-100,0,100,200]}) p mask = p.loc[:]<0 p[mask] = 1000 p # In[488]: # Set the seed np.random.seed(121) dframe = DataFrame(np.random.randn(1000, 5)) dframe.tail(5) dframe.describe() # In[489]: col_0 = dframe[0] col_0.tail() # Show me the values of this Series that is > 3 col_0[np.abs(col_0) > 3] # In[490]:
print(iris.DESCR)

# Features and target as separate frames with readable column names.
iris_data = DataFrame(X, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
iris_target = DataFrame(Y, columns=['Species'])


def flower(num):
    """Map a numeric class (0, 1, anything else) to its species label."""
    # NOTE(review): 'Vericolour' looks like a misspelling of 'Versicolour';
    # left unchanged because the label may be matched elsewhere.
    if num == 0:
        return 'Setosa'
    elif num == 1:
        return 'Vericolour'
    else:
        return 'Virginica'


iris_target['Species'] = iris_target['Species'].apply(flower)
print(iris_target.tail())

# Features and species side by side (column-wise concatenation).
iris = pd.concat([iris_data, iris_target], axis=1)

# NOTE(review): newer seaborn releases renamed ``size`` to ``height`` and
# ``factorplot`` to ``catplot`` -- confirm against the installed version.
sns.pairplot(iris, hue='Species', size=2)
plt.show()

sns.factorplot('Petal Length', data=iris, hue='Species', size=10)
plt.show()

from sklearn.linear_model import LogisticRegression
# ``sklearn.cross_validation`` no longer exists in current scikit-learn
# releases; ``train_test_split`` lives in ``sklearn.model_selection``.  The
# fallback keeps very old installations working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

logreg = LogisticRegression()

# 60 % training / 40 % test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.4, random_state=3)
from pandas import Series, DataFrame

__author__ = 'wangjj'
__mtime__ = '20161010下午 11:04'

# A Series built from a dict: the keys become the index, the values the data.
data = {'Scala': 2003, 'Java': 1995, 'Python': 1991, 'Go': 2009}
ser = Series(data)
print ser
# ``in`` on a Series tests the index labels, not the values.
print 'C' in ser
print 'Go' in ser
print ser.values.mean()
print '----'

# A DataFrame built from a dict of equal-length lists: one column per key.
datas = {
    'name': [
        'Wangdachui', 'Linling', 'Niuyun'],
    'pay': [
        4000, 5000, 6000]}
dataFra=DataFrame(datas)
print DataFrame(datas)
print '----'
# Column access by key and by attribute.
print dataFra['name']
print '----'
print dataFra.pay
print '----'
# First two and last two rows.
print dataFra.head(2)
print '----'
print dataFra.tail(2)
dframe

years = [1990, 1991, 1992, 2008, 2012, 2015, 1987, 1969, 2013, 2008, 1999]
# Group these years into decades.
decade_bins = [1960, 1970, 1980, 1990, 2000, 2010, 2020]
decade_cat = pd.cut(years, decade_bins)
decade_cat.shape
decade_cat.categories
# Number of years that fall into each decade bin.
pd.value_counts(decade_cat)

# Fixed seed so the random frame is reproducible.
np.random.seed(12345)
dframe = DataFrame(np.random.randn(1000, 4))
dframe.head()
dframe.tail()
dframe.describe()

# Values of the first column whose magnitude exceeds 3.
col = dframe[0]
col.head()
col[np.abs(col) > 3]
np.abs(-3.33)
# Rows with at least one value whose magnitude exceeds 3.  The axis is passed
# by keyword: recent pandas releases no longer accept it positionally.
dframe[(np.abs(dframe) > 3).any(axis=1)]
np.sign(dframe)

# Reorder the rows of a 4 x 4 frame with a random permutation of 0..3.
dframe = DataFrame(np.arange(4 * 4).reshape((4, 4)))
blender = np.random.permutation(4)
blender
dframe
dframe.take(blender)
# In[195]: KM=[kmeans(X,k) for k in K] print type(KM),len(KM) # In[196]: KM_df=DataFrame(KM) print KM_df.head(1) # In[197]: print KM_df.tail(1) # In[198]: KM_df.shape # In[199]: KM_v1=KM_df[0] print type(KM_v1) # In[200]:
ctxs = [ dict(X=X, y=y, linear_regression=fct.fit, x_name=n) for n in range(10, n, h) ] return ctxs ## Get time execution # sklearn ElasticNet ctxs = get_vectors_elastic(100000) df = DataFrame( list(measure_time_dim('linear_regression(X, y)', ctxs, verbose=1))) df['fct'] = 'ElasticNet' print(df.tail(n=3)) dfs = [df] # naive & c++ implemantation for fct in [nv_regular_linreg, cpp_regular_linreg]: ctxs = get_vectors(fct, 100000) df = DataFrame( list( measure_time_dim( 'linear_regression(X, y, beta, alpha, L1_ratio, max_iter, tol, num_samples, num_features)', ctxs, verbose=1))) df['fct'] = fct.__name__ dfs.append(df)
# Write df without the index and without a header row.
df.to_csv('births1880.txt',index=False,header=False)

Location = r'births1880.txt'
# Default read: the first line of the file is taken as the header.
df = read_csv(Location)
print df
print df.head()

# header=None: every line is data; columns get default integer names.
df = read_csv(Location,header=None)
print df
print df.tail()

# names=...: every line is data and the columns get the given names.
df = read_csv(Location, names = ['Names','Births'])
print df.head()

# The temporary file is no longer needed once df is loaded.
import os
os.remove(Location)

# Distinct values of the 'Names' column, as an array and one per line.
print df['Names'].unique()
for x in df['Names'].unique():
    print x
print df['Names'].describe()
# Index every frame by its DATE column and keep its VALUE column, keyed by
# the matching code.  Note that this overwrites the index of each ``d``.
time_series = {}
for code, d in zip(codes,data):
    d.index = d.DATE
    time_series[code] = d.VALUE

# One column per code, aligned on the index.
merged_data = DataFrame(time_series)
# Unequal length series
print(merged_data)

# Term premium: GS10 minus GS1.
term_premium = merged_data['GS10'] - merged_data['GS1']
term_premium.name = 'Term'
merged_data = merged_data.join(term_premium,how='outer')
# Default premium: BAA minus AAA.
default_premium = merged_data['BAA'] - merged_data['AAA']
default_premium.name = 'Default'
merged_data = merged_data.join(default_premium,how='outer')
# The four source columns are dropped once the two premiums exist.
merged_data = merged_data.drop(['AAA','BAA','GS10','GS1'],axis=1)
print(merged_data.tail())

# Keep only the rows in which every column has a value.
quarterly = merged_data.dropna()
print(quarterly.tail())

# Replace the selected level columns with their period-over-period percentage
# change.
growth_rates_selector = ['GDPC1','INDPRO','CPILFESL']
growth_rates = quarterly[growth_rates_selector].pct_change()
final = quarterly.drop(growth_rates_selector, axis=1).join(growth_rates)

new_names = {'GDPC1':'GDP_growth','INDPRO':'IP_growth','CPILFESL':'Inflation','UNRATE':'Unemp_rate'}
# The second dropna() removes the first row, whose pct_change() is NaN.
final = final.rename(columns = new_names ).dropna()
# Persist the result: HDF5 under the key 'FRED' (zlib, level 6) and Excel.
final.to_hdf('FRED_data.h5','FRED',complevel=6,complib='zlib')
final.to_excel('FRED_data.xlsx')

# One subplot per selected column; the figure is taken from the first axes.
ax = final[['GDP_growth','IP_growth','Unemp_rate']].plot(subplots=True)
fig = ax[0].get_figure()
# Load the NAO table; its third column (index 2) holds the values used here.
nao = np.loadtxt('norm.nao.monthly.b5001.current.ascii')
# One monthly timestamp per row of the file, starting in January 1950.
dates_nao = pd.date_range('1950-01', periods=nao.shape[0], freq='M')
NAO = Series(nao[:, 2], index=dates_nao)
NAO.index

# Combine both series into one frame; they are aligned on their index.
aonao = DataFrame({'AO': AO, 'NAO': NAO})
aonao.plot(subplots=True)
aonao.head()
# A column can be read either by key or as an attribute.
aonao['NAO']
aonao.NAO

# Create a column from data already in the dataframe
aonao['Diff'] = aonao['AO'] - aonao['NAO']
aonao.head()

# Remove the column from the dataframe again
del aonao['Diff']
aonao.tail()

# Select a specific timeframe by index label range
aonao['1981-01':'1981-03']

# NAO values of the rows where AO is positive and NAO is negative, restricted
# to index values strictly between 1980-01-01 and 1989-01-01, drawn as a
# horizontal bar chart
import datetime
aonao.loc[(aonao.AO > 0) & (aonao.NAO < 0) &
          (aonao.index > datetime.datetime(1980, 1, 1)) &
          (aonao.index < datetime.datetime(1989, 1, 1)),
          'NAO'].plot(kind='barh')

# Column-wise statistics, then the row-wise mean (axis 1).
aonao.mean()
aonao.max()
aonao.min()
aonao.mean(1)
import xlrd  # needed when reading .xls files
import numpy as np
import sqlite3

# Build a data frame
smp = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nebada', 'Nebada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.6, 1.7, 3.5, 4.3]
       }
frame = DataFrame(smp)

# Access the elements of the data frame (R equivalents on the right)
frame.year  # frame$year
frame['year']  # frame$year
frame.head()  # head
frame.tail()  # tail

frame2 = DataFrame(
    smp, index=['one', 'two', 'three', 'four', 'five'])  # add an explicit index
# Row lookup by index label.  ``.ix`` no longer exists in current pandas;
# ``.loc`` is the label-based equivalent.
frame2.loc['one']
frame2.describe()  # summary
print(frame2.describe())

# Load data
data = pd.read_csv('stock_px.csv')
print(data)

xlsx_file = pd.ExcelFile('stock_px.xlsx')  # requires openpyxl; .xls files work too
xlsx_file.sheet_names
data = xlsx_file.parse('stock_px')
print(data)

# Load data from the web -> http://docs.scipy.org/doc/numpy/reference/generated/numpy.DataSource.html
from pandas import DataFrame
from Data import grade_dic

# Build the frame from grade_dic with one index label per row.
df = DataFrame(grade_dic, index=['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽'])

# Extract only the last rows of the whole frame.
# Without an argument, tail() returns 5 rows by default.
tail_data = df.tail()
print(tail_data)

print('-' * 30)

# With an argument, only that many rows (here the last 2) are returned.
tail_data2 = df.tail(2)
print(tail_data2)