Example #1
def __diff(data: pd.DataFrame) -> pd.DataFrame:
    """
    Compute row-to-row differences of the masl/lat/lon columns; the first row
    is dropped because its differences are NaN.
    """
    data['dmasl'] = data['masl'].diff()
    data['dlat'] = data['lat'].diff()
    data['dlon'] = data['lon'].diff()
    return data.tail(-1)
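A minimal usage sketch (pandas imported as pd; the frame uses the 'masl', 'lat' and 'lon' columns the helper above expects):

import pandas as pd

track = pd.DataFrame({'masl': [500.0, 510.0, 505.0],
                      'lat': [48.10, 48.11, 48.12],
                      'lon': [11.50, 11.51, 11.52]})
print(__diff(track))  # two rows remain; dmasl/dlat/dlon hold the row-to-row differences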
Example #2
import math

import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy import spatial


def gonzales(data, k):
    #transform the data numpy array into a data frame, using the id column as index
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    #add two columns to the points data frame for saving each point's assigned center and its distance to it
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choose the first point as the initial center (a random choice is left commented out)

    #center0 = points_list.sample(n=1, random_state=randint(0, 100), axis=0)
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance' , 'center'] , axis = 1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #loop k times; each pass picks the point farthest from its assigned center as the
    #next center (the extra center appended on the final pass is dropped after the loop)
    for k_cycle in range(1, k + 1):
        # variables tracking the next center to be chosen, based on the maximum
        # distance a point has to its closest current center
        max_distance = 0 
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            #variables tracking this point's closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                # Series.as_matrix was removed from pandas; select the coordinate entries by label instead
                dis = spatial.distance.euclidean(center[[0, 1]].to_numpy(), p[[0, 1]].to_numpy())
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            # write back with .at: mutating the iterrows() row copy would not update points_list
            points_list.at[indexp, "distance"] = min_cluster_distance
            points_list.at[indexp, "center"] = closest_cluster
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        # .append/.ix/.set_value were removed from pandas; use concat/.loc/.iloc/.at instead
        new_center = points_list.loc[[next_cluster]].iloc[:, :distance_column_index]
        centers_list = pd.concat([centers_list, new_center])
        centers_list.at[next_cluster, 'color'] = colors[k_cycle % len(colors)]  # modulo guards against k exceeding the palette
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list[[0, 1]].to_numpy()
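A quick, hedged usage sketch (synthetic data; column 0 is an id and the next two columns are the 2-D coordinates the function compares):

import numpy as np

rng = np.random.default_rng(0)
coords = rng.random((50, 2))
data = np.column_stack([np.arange(50.0), coords])  # id, x, y
centers = gonzales(data, k=3)                      # k rows of (x, y) center coordinates
print(centers)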
Example #3
avg_site = np.array([])
for i in site:
    avg_site = np.append(avg_site, np.mean(train["click"][train["site_category"] == i]))


# In[14]:

site_df = DataFrame({"site": site, "avg_click": avg_site})
site_df = site_df.sort_values(by="avg_click")  # DataFrame.sort was removed from pandas
plt.plot(range(len(site_df)), site_df["avg_click"], "bo", range(len(site_df)), site_df["avg_click"], "k")


# In[17]:

site_df.tail(2)


# In[21]:

t1["special_site"] = 0
t2["special_site"] = 0
test["special_site"] = 0
t1.loc[t1["site_category"] == "dedf689d", "special_site"] = 1
t2.loc[t2["site_category"] == "dedf689d", "special_site"] = 1
test.loc[test["site_category"] == "dedf689d", "special_site"] = 1


# In[22]:

print(sum(t1["special_site"]), sum(t2["special_site"]), sum(test["special_site"]))
Example #4
def tail_view(data: pd.DataFrame) -> pd.DataFrame:
    return data.tail()
Example #5
from pandas import read_csv

df.to_csv('births1880.txt', index=False, header=False)

Location = r'births1880.txt'

df = read_csv(Location)

print(df)

print(df.head())

df = read_csv(Location, header=None)

print(df)

print(df.tail())

df = read_csv(Location, names=['Names', 'Births'])

print(df.head())

import os

os.remove(Location)

print(df['Names'].unique())

for x in df['Names'].unique():
    print(x)

print(df['Names'].describe())
Example #6
ddd["retMsg"]
data = ddd["data"]
len(data)
data[0]
data[300]
######################################################
import json

from pandas import DataFrame

orderbook_json_file = "test1.json"
with open(orderbook_json_file, 'r') as fp:
    json_string = fp.readlines()[0]
# str has no .decode() in Python 3; json.loads handles the escaped text directly
orderbook_json_dict = json.loads(json_string)
data_list = orderbook_json_dict["data"]
data_list = data_list[300:]
dataSet = DataFrame(data_list)
# dataSet.head(10)
dataSet.tail(10)
######################################################
Example #7
from pandas import DataFrame
import pandas._testing as tm  # assumed location; older pandas exposed this as pandas.util.testing


def test_head_tail_empty():
    # test empty dataframe
    empty_df = DataFrame()
    tm.assert_frame_equal(empty_df.tail(), empty_df)
    tm.assert_frame_equal(empty_df.head(), empty_df)
Example #8
dictSeries=Series(myDict)
dictSeries

#Creating a data frame from dictionary
empDict = {'id': [1, 2, 3, 4],
           'name': ['Mark', 'Ian', 'Sam', 'Rich'],
           'isManager': [False, True, False, True]}

## Data Structure : Data Frame from a dictionary
empDf = DataFrame(empDict)

#Access rows and columns 
empDf.name
empDf.name[2]
empDf[empDf.isManager == False]
empDf.head()
empDf.tail()
empDf.iloc[2,]

#Append a new row (this also introduces a new 'deptId' column)
#(note: DataFrame.append was removed in pandas 2.0; use pd.concat there)
empDf = empDf.append(Series([5, False, 'Derek', 2],
                            index=['id', 'isManager', 'name', 'deptId']),
                     ignore_index=True)
empDf

#Deleting a column
empDf['dummy']=1
empDf
del empDf['dummy']
empDf

#Deleting a row, e.g. by index label
empDf.drop(0)
Example #9
# Note: .iloc is strictly integer position based
df.iloc[0:3]

# We can also select using the column name
df['Rev']
df[['Rev','test']]

# df['ColumnName'][inclusive:exclusive]
df['Rev'][0:3]

df['col'][5:]

df[['col', 'test']][:3]

'''
There are also handy functions to select the top and bottom records of a dataframe
'''
# Select top N number of records (default = 5)
df.head()

# Select bottom N number of records (default = 5)
df.tail()
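
# Both also accept an explicit count, and a negative count drops rows from the
# other end (Example #1 uses tail(-1) to drop the first row):
df.head(3)    # first 3 rows
df.tail(2)    # last 2 rows
df.tail(-1)   # every row except the first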


Example #10
import os

import pandas as pd
from pandas import DataFrame


def readFiles(path):
    # The body of this helper was truncated in the source; this is a minimal,
    # assumed reconstruction: walk the directory tree and yield each file's
    # path together with its full text content.
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)
            with open(path, 'r', encoding='latin1') as f:
                message = f.read()
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)


#An empty dataframe with 'message' and 'class' headers
data = DataFrame({'message': [], 'class': []})

#Include the email details with the spam/ham classification in the dataframe
#(DataFrame.append was removed in pandas 2.0, so concatenate instead)
data = pd.concat([
    data,
    dataFrameFromDirectory(
        'F:/UTD/Machine Learning/Dataset1/hw2_train/train/ham', 'ham'),
    dataFrameFromDirectory(
        'F:/UTD/Machine Learning/Dataset1/hw2_train/train/spam', 'spam'),
])

#Head and tail of 'data'
print(data.head())
print(data.tail())

from sklearn.feature_extraction.text import CountVectorizer

vectoriser = CountVectorizer()
count = vectoriser.fit_transform(data['message'].values)
print(count)
Example #11
print("=" * 50)

#print the column names
print(city_frame.columns)  #prints the Index of column names
print("=" * 50)

#print a single column
print(city_frame["Population"])
print("=" * 50)

#print the first 3 rows
print(city_frame.head(3))
print("=" * 50)
#print the last 2 rows
print(city_frame.tail(2))
print("=" * 50)

#print a single row of the data frame (.ix was removed from pandas)
print(city_frame.iloc[2])  #print the third row
print("=" * 50)

#adding new column
#city_frame["Stadium"]=np.array(["Manchester","Liverpool","Chelsea"])
city_frame["Stadium"] = ["Manchester", "Liverpool", "Chelsea"]
#note: both passing a list of values and a numpy array work
print(city_frame)
print("=" * 50)

import webbrowser
website = "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html"
webbrowser.open(website)
Example #12
class Technical(StockGet):
    def __init__(self, stocknum):
        #print("analyze0")
        super().__init__(stocknum)
        #print("analyze1")
        self.vol_get(self.db_s)  #volatility calculation
        self.getsymbol(self.db_s)  #daily price data
        self.week_getsymbol(self.db_sw)  #weekly price data
        #print("analyze2")
        self.macd(self.db_s)
        self.smaget(self.db_s)
        #print("analyze3")
        self.getbps()
        #print("analyze4")
        self.getper(self.db_s)
        self.smacross(self.db_s)
        self.bolinger(self.db_s)
        self.smaweekget(self.db_sw)
        self.kairi(self.db_s)
        self.combine()
        #print("self.db_sw")
        #print(self.db_sw)

    def vol_get(self, mat):  #get the volatility of this stock
        self.stock_vol = 100 * mat.pct_change()
        self.stock_vol = self.stock_vol.dropna()
        self.volatirity = np.std(self.stock_vol)  #standard deviation of the past year's daily volatility
        self.mean_vol = np.mean(self.stock_vol)  #mean of the past year's daily volatility
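        # Build a frame holding the daily % changes together with flat
        # +/-1, 2, 3 sigma bands around their mean (a volatility envelope).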
        self.stockvolmat = DataFrame(self.stock_vol,
                                     columns=[
                                         'Stock', '+sigma', '-sigma',
                                         '+2sigma', '-2sigma', '+3sigma',
                                         '-3sigma'
                                     ])
        #print("stockvolmat")
        #print(len(stockvolmat))
        self.stockvolmat['+sigma'] = np.full(len(self.stockvolmat),
                                             self.mean_vol + self.volatirity)
        self.stockvolmat['+2sigma'] = np.full(
            len(self.stockvolmat), self.mean_vol + 2 * self.volatirity)
        self.stockvolmat['+3sigma'] = np.full(
            len(self.stockvolmat), self.mean_vol + 3 * self.volatirity)
        self.stockvolmat['-sigma'] = np.full(len(self.stockvolmat),
                                             self.mean_vol - self.volatirity)
        self.stockvolmat['-2sigma'] = np.full(
            len(self.stockvolmat), self.mean_vol - 2 * self.volatirity)
        self.stockvolmat['-3sigma'] = np.full(
            len(self.stockvolmat), self.mean_vol - 3 * self.volatirity)
        #self.vol = [self.stock_vol, self.volatirity]
        #print(stockvolmat)
        #return stockvolmat

    def macd(self, mat):  #compute MACD data
        self.s_dur = setting.S_DUR  #MACD parameter
        self.l_dur = setting.L_DUR  #MACD parameter
        self.sig_dur = setting.SIG_DUR  #MACD parameter
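        # MACD = EMA(short span) - EMA(long span); the signal line V_SIG is a
        # rolling mean of MACD over sig_dur, and DIF_MACD is the histogram
        # (MACD minus signal).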
        mat = DataFrame(mat.dropna())
        self.dmatmacd = DataFrame(mat,
                                  columns=[
                                      'Stock', 'LONG', 'SHORT', 'V_MACD',
                                      'V_SIG', 'DIF_MACD', 'ZERO'
                                  ])
        self.dmatmacd['ZERO'] = np.full(self.dmatmacd.shape[0], 0)
        self.dmatmacd['LONG'] = self.dmatmacd['Stock'].ewm(
            span=self.l_dur).mean()
        self.dmatmacd['SHORT'] = self.dmatmacd['Stock'].ewm(
            span=self.s_dur).mean()
        self.dmatmacd[
            'V_MACD'] = self.dmatmacd['SHORT'] - self.dmatmacd['LONG']
        self.dmatmacd['V_SIG'] = self.dmatmacd['V_MACD'].rolling(
            window=self.sig_dur, center=False).mean()
        self.dmatmacd[
            'DIF_MACD'] = self.dmatmacd['V_MACD'] - self.dmatmacd['V_SIG']

        #return self.dmatmacd
        #dmat['LONG'] = pd.ewma(dmat['Stock'], span=self.l_dur)
        #dmat['SHORT'] = pd.ewma(dmat['Stock'], span=self.s_dur)
        #dmat['V_MACD'] = dmat['SHORT'] - dmat['LONG']
        #dmat['V_SIG'] = pd.rolling_mean(MA['V_MACD'], self.sig_dur)
        #dmat['DIF_MACD'] = dmat['V_MACD'] - dmat['V_SIG']

    def smaget(self, mat):  #compute SMA (simple moving average) data
        self.s_idou = setting.S_IDOU
        self.m_idou = setting.M_IDOU
        self.l_idou = setting.L_IDOU
        mat = DataFrame(mat.dropna())
        self.dmatsma = DataFrame(mat,
                                 columns=['Stock', 'MA_S', 'MA_M', 'MA_L'])
        self.dmatsma['MA_S'] = self.dmatsma['Stock'].rolling(
            window=self.s_idou, center=False).mean()
        self.dmatsma['MA_M'] = self.dmatsma['Stock'].rolling(
            window=self.m_idou, center=False).mean()
        self.dmatsma['MA_L'] = self.dmatsma['Stock'].rolling(
            window=self.l_idou, center=False).mean()
        self.dmat_2month = self.dmatsma['Stock'][
            datetime.datetime(self.Y, self.M, self.D) - datetime.timedelta(
                days=60):datetime.datetime(self.Y, self.M, self.D)]
        self.stockmax = np.nanmax(self.dmat_2month) * 1.02
        self.stockmin = np.nanmin(self.dmat_2month) * 0.98

        #MA['MA_M'] = pd.rolling_mean(MA['Stock'], m_idou)
        #MA['MA_L'] = pd.rolling_mean(MA['Stock'], l_idou)
        self.stock_p1sig = round(
            self.stock_now * (math.e**(self.volatirity / 100)), 0)
        self.stock_p05sig = round(
            self.stock_now * (math.e**(0.5 * self.volatirity / 100)), 0)
        self.stock_n1sig = round(
            self.stock_now * (math.e**(-self.volatirity / 100)), 0)
        self.stock_n05sig = round(
            self.stock_now * (math.e**(-0.5 * self.volatirity / 100)), 0)
        #print("self.dmatsma['MA_S']")
        #print(self.dmatsma['MA_S'])
        self.smatoday = self.dmatsma['MA_S'].tail(1)[0]
        #print("self.smatoday")
        #print(self.smatoday)

        #self.stockinfo = [stock_p1sig["Stock"][0], stock_p05sig["Stock"][0], stock_n1sig["Stock"][0], stock_n05sig["Stock"][0]]
        #print("self.stockinfo")
        #print(self.stockinfo)
        return self.dmatsma, self.stockmax, self.stockmin

    def getsymbol(self, mat):
        mat = DataFrame(mat.dropna())
        self.dmatstock = DataFrame(mat, columns=['Stock'])
        #print("self.dmatstock")
        #print(self.dmatstock)
        self.stock_old = self.dmatstock.head(1)  # price one year ago
        self.stock_now = self.dmatstock.tail(1)  # today's price
        self.stock_exp = 100 * math.log(
            self.stock_now['Stock'][0] / self.stock_old['Stock'][0],
            math.e)  #log rate of change between the price one year ago and today
        self.stock_kinou = self.dmatstock[len(self.dmatstock) -
                                          2:len(self.dmatstock) -
                                          1]['Stock'][0]  #yesterday's price
        self.stock_ototoi = self.dmatstock[len(self.dmatstock) -
                                           3:len(self.dmatstock) -
                                           2]['Stock'][0]  # price two days ago
        self.dif_kinou_ototoi = self.stock_kinou - self.stock_ototoi  # yesterday's price - price two days ago
        self.dif_today_kinou = self.stock_now - self.stock_kinou  # today's price - yesterday's price

        #print("self.stock_kinou")
        #print(self.stock_kinou)
        #print("self.stock_ototoi")
        #print(self.stock_ototoi)

        #self.volatirity=np.std(self.stock_vol) #standard deviation of the past year's daily volatility
        #self.mean_vol=np.mean(self.stock_vol) #mean of the past year's daily volatility
        #print(self.volatirity)
        #print(self.mean)

    def week_getsymbol(self, mat):
        mat = DataFrame(mat.dropna())
        #print("week")
        #print(mat)

        self.dmatstock = DataFrame(mat, columns=['Stock'])
        self.stock_1weekmae = self.dmatstock[len(self.dmatstock) -
                                             2:len(self.dmatstock) -
                                             1]['Stock'][0]  #weekly close one week ago
        self.stock_2weekmae = self.dmatstock[len(self.dmatstock) -
                                             3:len(self.dmatstock) -
                                             2]['Stock'][0]  #weekly close two weeks ago
        self.stock_3weekmae = self.dmatstock[len(self.dmatstock) -
                                             4:len(self.dmatstock) -
                                             3]['Stock'][0]  #weekly close three weeks ago
        self.dif_3_2_week = self.stock_2weekmae - self.stock_3weekmae  #weekly close (2 weeks ago) - weekly close (3 weeks ago)
        self.dif_2_1_week = self.stock_1weekmae - self.stock_2weekmae  #weekly close (1 week ago) - weekly close (2 weeks ago)
        #print("self.dif_3_2_week")
        #print(self.dif_3_2_week)
        #print("self.dif_2_1_week")
        #print(self.dif_2_1_week )

    def getbps(self):  #per-share profit (BPS) of the specified stock
        #print("self.bps")
        self.record_stockdb = pd.read_csv(
            setting.CSV_DB_PATH,
            names=setting.CSV_DB_PATH_COLUMNS)  #load the info for all registered stocks
        #self.record_stockdb = pd.read_csv("/home/pi/Desktop/stock/sqltest.csv", names=setting.CSV_DB_PATH_COLUMNS) #load the info for all registered stocks
        #print("record_stockdb")
        #print(self.record_stockdb)
        #print("record_stockdb end")
        self.getinfo_of_stock = DataFrame(
            self.record_stockdb.loc[self.record_stockdb['STOCK_NUM'] == int(
                self.stocknum)])  #extract the single row for the target stock (.ix was removed from pandas)
        #print("getinfo_of_stock")
        #print(self.getinfo_of_stock)
        #print("getinfo_of_stock end")
        self.getinfo_of_stock = self.getinfo_of_stock.set_index(
            'STOCK_NUM')  # use the STOCK_NUM column as the index
        #print("1")
        self.stockinfo = DataFrame(
            self.getinfo_of_stock['LASTYEAR_PROFIT_PER_STOCK']
        )  # keep only the 'LASTYEAR_PROFIT_PER_STOCK' column
        #print("self.stockinfo")
        #print(self.stockinfo)
        self.bps = self.stockinfo['LASTYEAR_PROFIT_PER_STOCK'][int(
            self.stocknum
        )]  # take the 'LASTYEAR_PROFIT_PER_STOCK' value at index stocknum
        #print("self.bps")
        #print(self.bps)
        #print(hen)

    def getper(self, mat):  #compute the PER history for the specified stock

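        # PER = price / per-share earnings (self.bps); PER1..PER4 split the
        # [PER_MIN, PER_MAX] range into five equal bands for charting.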
        if self.bps == 0:
            print("SKIP of PER_Dict")
            self.per_now = 0
            self.per_max = 1
            self.per_min = 0
            #return round(per_now, 1), round(per_max, 1)
        else:
            mat = DataFrame(mat.dropna())
            self.maper = DataFrame(mat,
                                   columns=[
                                       'Stock', 'PER', 'PER_MIN', 'PER1',
                                       'PER2', 'PER3', 'PER4', 'PER_MAX'
                                   ])
            self.maper['PER'] = self.maper['Stock'] / float(
                self.bps)  # compute PER for each fetched price
            # print('MA')
            # print(MA)
            self.maper['PER_MAX'] = np.nanmax(self.maper['PER'])
            self.maper['PER_MIN'] = np.nanmin(self.maper['PER'])
            self.maper['PER1'] = self.maper['PER_MIN'] + (
                self.maper['PER_MAX'] - self.maper['PER_MIN']) / 5
            self.maper['PER2'] = self.maper['PER_MIN'] + 2 * (
                self.maper['PER_MAX'] - self.maper['PER_MIN']) / 5
            self.maper['PER3'] = self.maper['PER_MIN'] + 3 * (
                self.maper['PER_MAX'] - self.maper['PER_MIN']) / 5
            self.maper['PER4'] = self.maper['PER_MIN'] + 4 * (
                self.maper['PER_MAX'] - self.maper['PER_MIN']) / 5

            self.per_max = self.maper['PER_MAX'][0]
            self.per1 = self.maper['PER1'][0]
            self.per2 = self.maper['PER2'][0]
            self.per3 = self.maper['PER3'][0]
            self.per4 = self.maper['PER4'][0]
            self.per_min = self.maper['PER_MIN'][0]

            per_now_mat = DataFrame(self.maper['PER'])
            per_now = per_now_mat.tail(1)['PER'][0]
            self.per_now = float(per_now)

            # print( MA['PER'])
            # print('PER_NOW')
            # print(PER_NOW)
            # print("PER_MAX")
            # print(PER_MAX)
            #print("PER_MIN")
            #print(per_min)

    def smacross(self, mat):
        mat = DataFrame(mat.dropna())
        self.ma_smacross = DataFrame(mat, columns=['Stock', 'ZERO', 'DIF_SMA'])
        self.ma_smacross['ZERO'] = np.full(self.ma_smacross.shape[0], 0)
        self.ma_smacross['DIF_SMA'] = self.ma_smacross['Stock'] - self.dmatsma[
            'Stock'].rolling(window=self.s_idou, center=False).mean()
        #self.dmatsma['MA_S'] = self.dmatsma['Stock'].rolling(window=self.s_idou, center=False).mean()

    def bolinger(self, mat):
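        # Bollinger bands: MA_M is the m_idou-period moving average and
        # B_U1..B_U3 / B_L1..B_L3 are that average plus/minus 1, 2 and 3
        # rolling standard deviations (B_STV).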
        mat = DataFrame(mat.dropna())
        self.ma_boli = DataFrame(mat,
                                 columns=[
                                     'Stock', 'MA_M', 'B_U1', 'B_U2', 'B_U3',
                                     'B_L1', 'B_L2', 'B_L3', 'B_STV'
                                 ])
        self.ma_boli['MA_M'] = self.ma_boli['Stock'].rolling(
            window=self.m_idou, center=False).mean()
        self.ma_boli['B_STV'] = self.ma_boli['Stock'].rolling(
            window=self.m_idou, center=False).std()
        self.ma_boli['B_U1'] = self.ma_boli['MA_M'] + self.ma_boli['B_STV']
        self.ma_boli['B_U2'] = self.ma_boli['MA_M'] + 2 * self.ma_boli['B_STV']
        self.ma_boli['B_U3'] = self.ma_boli['MA_M'] + 3 * self.ma_boli['B_STV']
        self.ma_boli['B_L1'] = self.ma_boli['MA_M'] - self.ma_boli['B_STV']
        self.ma_boli['B_L2'] = self.ma_boli['MA_M'] - 2 * self.ma_boli['B_STV']
        self.ma_boli['B_L3'] = self.ma_boli['MA_M'] - 3 * self.ma_boli['B_STV']
        #print("self.ma_boli")
        #print(self.ma_boli)

        #MA['Stock'].plot(ax=ax40, figsize=(FST, FSY), legend=True, linestyle='-', marker='*', color='blue')
        #MA['MA_M'].plot(ax=ax40, figsize=(FST, FSY), legend=True, linestyle='--', marker='', color='red')
        #MA['B_U1'].plot(ax=ax40, figsize=(FST, FSY), legend=True, linestyle='--', marker='', color='black')
        #MA['B_U2'].plot(ax=ax40, figsize=(FST, FSY), legend=True, linestyle='--', marker='', color='black')
        #MA['B_U3'].plot(ax=ax40, figsize=(FST, FSY), legend=True, linestyle='--', marker='', color='black')
        #MA['B_L1'].plot(ax=ax40, figsize=(FST, FSY), legend=True, linestyle='--', marker='', color='black')
        #MA['B_L2'].plot(ax=ax40, figsize=(FST, FSY), legend=True, linestyle='--', marker='', color='black')
        #MA['B_L3'].plot(ax=ax40, figsize=(FST, FSY), legend=True, linestyle='--', marker='', color='black')
        #ax40.set_xlim([datetime.datetime(Y, M - 2, 1), datetime.datetime(Y, M, D)])  # axis display range: from the 1st of the month two months ago until today (the 1st is used to avoid a bug)
        #ax40.set_ylabel("Bolinger")
        #ax40.yaxis.tick_right()
        #ax40.legend_.remove()

    def smaweekget(self, mat):
        s_idou_w = setting.S_IDOU_W
        m_idou_w = setting.M_IDOU_W
        l_idou_w = setting.L_IDOU_W
        mat = DataFrame(mat.dropna())
        #print(mat)
        self.ma_smawkget = DataFrame(mat,
                                     columns=['Stock', 'MA_S', 'MA_M', 'MA_L'])
        self.ma_smawkget['MA_S'] = self.ma_smawkget['Stock'].rolling(
            window=s_idou_w, center=False).mean()
        self.ma_smawkget['MA_M'] = self.ma_smawkget['Stock'].rolling(
            window=m_idou_w, center=False).mean()
        self.ma_smawkget['MA_L'] = self.ma_smawkget['Stock'].rolling(
            window=l_idou_w, center=False).mean()

    def kairi(self, mat):
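        # KAIRI (divergence rate): percentage deviation of the price from its
        # m_idou-period moving average, 100 * (price / MA_M - 1), over the last
        # 200 rows; L1..L3 split the [MIN, MAX] range into quarters.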
        #U0_NUM=5
        #U1_NUM=10
        self.m_idou = setting.M_IDOU
        mat = DataFrame(mat.dropna())
        self.ma_kairi = DataFrame(mat,
                                  columns=[
                                      'Stock', 'MA_M', 'KAIRI', 'MAX', 'MIN',
                                      'L1', 'L2', 'L3', 'ZERO'
                                  ])

        self.ma_kairi['MA_M'] = self.dmatsma['Stock'].rolling(
            window=self.m_idou, center=False).mean()

        self.ma_kairi['KAIRI'] = (self.ma_kairi['Stock'].tail(200) /
                                  self.ma_kairi['MA_M'].tail(200)) * 100 - 100
        self.kairi_max = round(np.max(self.ma_kairi['KAIRI']), 1)
        self.kairi_min = round(np.min(self.ma_kairi['KAIRI']), 1)
        #print("MAX")
        #print(self.kairi_max)
        #print("MIN")
        #print(self.kairi_min)
        self.ma_kairi['MAX'] = self.kairi_max
        self.ma_kairi['MIN'] = self.kairi_min
        self.ma_kairi['L1'] = self.kairi_min + (self.kairi_max -
                                                self.kairi_min) / 4
        self.ma_kairi['L2'] = self.kairi_min + 2 * (self.kairi_max -
                                                    self.kairi_min) / 4
        self.ma_kairi['L3'] = self.kairi_min + 3 * (self.kairi_max -
                                                    self.kairi_min) / 4
        self.ma_kairi['ZERO'] = 0
        self.kairi_sig = np.std(
            self.ma_kairi['KAIRI'])  #standard deviation of the past year's KAIRI values
        self.kairi_mean = np.mean(
            self.ma_kairi['KAIRI'])  #mean of the past year's KAIRI values
        #print("self.ma_kairi")
        #print(self.ma_kairi)

        kairi_today = round(self.ma_kairi['KAIRI'].tail(1)[0], 1)  #today's divergence (KAIRI) rate
        if kairi_today <= self.kairi_min + 1 * (self.kairi_max -
                                                self.kairi_min) / 4:
            self.kairi_rank = 0
        elif kairi_today > self.kairi_min + 1 * (
                self.kairi_max -
                self.kairi_min) / 4 and kairi_today <= self.kairi_min + 2 * (
                    self.kairi_max - self.kairi_min) / 4:
            self.kairi_rank = 1
        elif kairi_today > self.kairi_min + 2 * (
                self.kairi_max -
                self.kairi_min) / 4 and kairi_today <= self.kairi_min + 3 * (
                    self.kairi_max - self.kairi_min) / 4:
            self.kairi_rank = 2
        elif kairi_today > self.kairi_min + 3 * (
                self.kairi_max - self.kairi_min
        ) / 4 and kairi_today <= self.kairi_min + 4 * (
                self.kairi_max - self.kairi_min
        ) / 4 + 0.1:  # an error occurred when kairi_today == kairi_max, likely due to rounding error; +0.1 added as a workaround. 2019/01/15
            self.kairi_rank = 3

        #print("kairi_today")
        #print(kairi_today)
        #print("kairi_min")
        #print(self.kairi_min)
        #print("kairi_max")
        #print(self.kairi_max)
        #print("kairi_rank")
        #print(self.kairi_rank)

        #print("ma_kairi")
        #print(self.ma_kairi)
        #self.dmatsma['KAIRI'] = 100*math.log(self.dmatsma['MA_M'].tail(60)/self.dmatsma['Stock'].tail(60))
        #print()
        #print(self.dmatsma)

    def combine(self):

        #includes today-minus-yesterday and yesterday-minus-two-days-ago price differences
        #self.comb = [round(self.volatirity[0],2),round(self.stock_exp,2), self.stock_now['Stock'][0], self.stock_kinou, self.stock_ototoi, round(self.per_max,2), round(self.per_min,2),round(self.per_now,2),self.stocknum
        #             , self.stock_n05sig['Stock'][0], self.stock_p05sig['Stock'][0], self.stock_n1sig['Stock'][0], self.stock_p1sig['Stock'][0], self.smatoday,self.kairi_max,self.kairi_min,self.kairi_rank]

        #includes 1-week-ago minus 2-weeks-ago and 2-weeks-ago minus 3-weeks-ago price differences
        self.comb = [
            round(self.volatirity[0], 2),
            round(self.stock_exp, 2), self.stock_now['Stock'][0],
            self.dif_2_1_week, self.dif_3_2_week,
            round(self.per_max, 2),
            round(self.per_min, 2),
            round(self.per_now,
                  2), self.stocknum, self.stock_n05sig['Stock'][0],
            self.stock_p05sig['Stock'][0], self.stock_n1sig['Stock'][0],
            self.stock_p1sig['Stock'][0], self.smatoday, self.kairi_max,
            self.kairi_min, self.kairi_rank, self.stock_kinou
        ]
Example #13
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

revenue = pd.read_clipboard()
print(revenue)
print('\n', revenue.columns)
print('\n', revenue['Walmart'])

print(DataFrame(revenue, columns=['1', 'Walmart', 'Retail']))

revenue2 = DataFrame(revenue, columns=['1', 'Walmart', 'Retail', 'yo'])
print(revenue2)

print(revenue2.head(2))
print(revenue2.tail(2))
print(revenue.iloc[0])  # .ix was removed from pandas

arraynp = np.array([100, 200, 300, 400, 500])
revenue2['yo'] = arraynp
print(revenue2)

arrayseries = Series([200, 500], index=[2, 3])
revenue2['yo'] = arrayseries
print(revenue2)

del revenue2['yo']
print(revenue2)

sample = {'company': ['a', 'b'], 'profit': [1000, 2000]}
print(sample)
Example #14
import pandas as pd
import plotly.graph_objects as go  # assumed import for go.Scatter


def _plot_line(df: pd.DataFrame, head: int, line_name: str, y: str = "close"):
    if head:
        df = df.tail(head)
    plot_data = go.Scatter(x=df["day"], y=df[y], name=line_name)
    return plot_data
Example #15
class Results:

    """
    A very simple database of results with a notification for new results.
    The new results are fed directly by the :class:`.StrategyBase`, outside of the
    :class:`.EventBus`.

    .. Note::

      Later on, maybe this will be a cool actual database which allows to
      persistently store past evaluations for a given problem.
      This would allow resuming and further a-posteriori analysis.
      In the meantime, this is a pandas DataFrame.
    """

    def __init__(self, strategy):
        self.logger = strategy.config.get_logger('RSLTS')
        self.strategy = strategy
        self.eventbus = strategy.eventbus
        self.problem = strategy.problem
        self.results = None
        self._last_nb = 0  # for logging

    def add_results(self, new_results):
        """
        Add a single new :class:`Result` object, or a list of them.
        Then, publish a ``new_results`` event.
        """
        from pandas import DataFrame, MultiIndex, concat
        if self.results is None:
            if len(new_results) == 0:
                return
            r = new_results[0]
            midx_x = [('x', _) for _ in range(len(r.x))]
            len_cv_vec = 0 if r.cv_vec is None else len(r.cv_vec)
            midx_cv = [('cv', _) for _ in range(len_cv_vec)]
            midx = MultiIndex.from_tuples(
                midx_x + [('fx', 0)] +
                midx_cv + [('cv', 0), ('who', 0), ('error', 0)])
            self.results = DataFrame(columns=midx)

        assert all([isinstance(_, Result) for _ in new_results])
        # notification for all received results at once
        self.eventbus.publish("new_results", results=new_results)

        new_rows = []
        for r in new_results:
            new_rows.append(
                np.r_[r.x, r.fx,
                      [] if r.cv_vec is None else r.cv_vec,
                      [r.cv, r.who, r.error]])
        results_new = DataFrame(new_rows, columns=self.results.columns)
        # DataFrame.append was removed in pandas 2.0; concatenate instead
        self.results = concat([self.results, results_new], ignore_index=True)

        if len(self.results) / 100 > self._last_nb / 100:
            self.info()
            self._last_nb = len(self.results)

    def info(self):
        self.logger.info("%d results in DB" % len(self))
        self.logger.debug("Dataframe Results:\n%s" % self.results.tail(3))

    def __iadd__(self, results):
        self.add_results(results)
        return self

    def __len__(self):
        return len(self.results) if self.results is not None else 0
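A minimal, hedged sketch of the column layout that add_results builds, assuming a 2-dimensional x and no constraint-violation vector:

from pandas import DataFrame, MultiIndex

midx = MultiIndex.from_tuples(
    [('x', 0), ('x', 1), ('fx', 0), ('cv', 0), ('who', 0), ('error', 0)])
results = DataFrame(columns=midx)
print(results.columns.tolist())
# [('x', 0), ('x', 1), ('fx', 0), ('cv', 0), ('who', 0), ('error', 0)]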
Example #16
#2.3 Create a time series
#pandas.date_range(start=None, end=None, periods=None, freq='D',
#   tz=None, normalize=False, name=None, closed=None, **kwargs)
dates = pd.date_range('20180101', periods=12, freq='M')  # 12 month-end dates
print (dates)

np.random.seed(5)
df=pd.DataFrame(np.random.randn(12,4),index=dates,
                 columns=list('ABCD'))
df

#view the first n rows of the data (default n=5)
df.head()

#view the last 3 rows of the data
df.tail(3)

#view the data's index, columns, and values
print(df.index)

print(df.columns)

print(df.values)

#transpose the data
# df.T

#sort the data by index (rows: axis=0, columns: axis=1)
df.sort_index(axis=1,ascending=False)

#sort by the values of a column, e.g. df.sort_values(by='A')
Example #17
class mcmc(CovmatSampler):
    r"""
    Adaptive, speed-hierarchy-aware MCMC sampler (adapted from CosmoMC)
    \cite{Lewis:2002ah,Lewis:2013hha}.
    """
    _at_resume_prefer_new = CovmatSampler._at_resume_prefer_new + [
        "burn_in", "callback_function", "callback_every", "max_tries", "output_every",
        "learn_every", "learn_proposal_Rminus1_max", "learn_proposal_Rminus1_max_early",
        "learn_proposal_Rminus1_min", "max_samples", "Rminus1_stop", "Rminus1_cl_stop",
        "Rminus1_cl_level", "covmat", "covmat_params"]
    _at_resume_prefer_old = CovmatSampler._at_resume_prefer_new + [
        "proposal_scale", "blocking"]

    # instance variables from yaml
    burn_in: NumberWithUnits
    learn_every: NumberWithUnits
    output_every: NumberWithUnits
    callback_every: NumberWithUnits
    max_tries: NumberWithUnits
    max_samples: int
    drag: bool
    callback_function: Optional[callable]
    blocking: Optional[Sequence]
    proposal_scale: float
    learn_proposal: bool
    learn_proposal_Rminus1_max_early: float
    Rminus1_cl_level: float
    Rminus1_stop: float
    Rminus1_cl_stop: float
    Rminus1_single_split: int
    learn_proposal_Rminus1_min: float
    measure_speeds: bool
    oversample_thin: int
    oversample_power: float

    def set_instance_defaults(self):
        super().set_instance_defaults()
        # checkpoint variables
        self.converged = None
        self.mpi_size = None
        self.Rminus1_last = np.inf

    def initialize(self):
        """Initializes the sampler:
        creates the proposal distribution and draws the initial sample."""
        if not self.model.prior.d():
            raise LoggedError(self.log, "No parameters being varied for sampler")
        self.log.debug("Initializing")
        # MARKED FOR DEPRECATION IN v3.0
        if getattr(self, "oversample", None) is not None:
            self.log.warning("*DEPRECATION*: `oversample` will be deprecated in the "
                             "next version. Oversampling is now requested by setting "
                             "`oversample_power` > 0.")
        # END OF DEPRECATION BLOCK
        # MARKED FOR DEPRECATION IN v3.0
        if getattr(self, "check_every", None) is not None:
            self.log.warning("*DEPRECATION*: `check_every` will be deprecated in the "
                             "next version. Please use `learn_every` instead.")
            # BEHAVIOUR TO BE REPLACED BY ERROR:
            self.learn_every = getattr(self, "check_every")
        # END OF DEPRECATION BLOCK
        if self.callback_every is None:
            self.callback_every = self.learn_every
        self._quants_d_units = []
        for q in ["max_tries", "learn_every", "callback_every", "burn_in"]:
            number = NumberWithUnits(getattr(self, q), "d", dtype=int)
            self._quants_d_units.append(number)
            setattr(self, q, number)
        self.output_every = NumberWithUnits(self.output_every, "s", dtype=int)
        if is_main_process():
            if self.output.is_resuming() and (
                    max(self.mpi_size or 0, 1) != max(get_mpi_size(), 1)):
                raise LoggedError(
                    self.log,
                    "Cannot resume a run with a different number of chains: "
                    "was %d and now is %d.", max(self.mpi_size or 0, 1),
                    max(get_mpi_size(), 1))
            if more_than_one_process():
                if get_mpi().Get_version()[0] < 3:
                    raise LoggedError(self.log, "MPI use requires MPI version 3.0 or "
                                                "higher to support IALLGATHER.")
        sync_processes()
        # One collection per MPI process: `name` is the MPI rank + 1
        name = str(1 + (lambda r: r if r is not None else 0)(get_mpi_rank()))
        self.collection = Collection(
            self.model, self.output, name=name, resuming=self.output.is_resuming())
        self.current_point = OneSamplePoint(self.model)
        # Use standard MH steps by default
        self.get_new_sample = self.get_new_sample_metropolis
        # Prepare callback function
        if self.callback_function is not None:
            self.callback_function_callable = (
                get_external_function(self.callback_function))
        # Useful for getting last points added inside callback function
        self.last_point_callback = 0
        # Monitoring/restore progress
        if is_main_process():
            cols = ["N", "timestamp", "acceptance_rate", "Rminus1", "Rminus1_cl"]
            self.progress = DataFrame(columns=cols)
            self.i_learn = 1
            if self.output and not self.output.is_resuming():
                with open(self.progress_filename(), "w",
                          encoding="utf-8") as progress_file:
                    progress_file.write("# " + " ".join(self.progress.columns) + "\n")
        # Get first point, to be discarded -- not possible to determine its weight
        # Still, we need to compute derived parameters, since, as the proposal "blocked",
        # we may be saving the initial state of some block.
        # NB: if resuming but nothing was written (burn-in not finished): re-start
        if self.output.is_resuming() and len(self.collection):
            initial_point = (self.collection[self.collection.sampled_params]
                .iloc[len(self.collection) - 1]).values.copy()
            logpost = -(self.collection[_minuslogpost]
                        .iloc[len(self.collection) - 1].copy())
            logpriors = -(self.collection[self.collection.minuslogprior_names]
                          .iloc[len(self.collection) - 1].copy())
            loglikes = -0.5 * (self.collection[self.collection.chi2_names]
                               .iloc[len(self.collection) - 1].copy())
            derived = (self.collection[self.collection.derived_params]
                       .iloc[len(self.collection) - 1].values.copy())
        else:
            # NB: max_tries adjusted to dim instead of #cycles (blocking not computed yet)
            self.max_tries.set_scale(self.model.prior.d())
            self.log.info("Getting initial point... (this may take a few seconds)")
            initial_point, logpost, logpriors, loglikes, derived = \
                self.model.get_valid_point(max_tries=self.max_tries.value)
            # If resuming but no existing chain, assume failed run and ignore blocking
            # if speeds measurement requested
            if self.output.is_resuming() and not len(self.collection) \
                    and self.measure_speeds:
                self.blocking = None
            if self.measure_speeds and self.blocking:
                if is_main_process():
                    self.log.warning(
                        "Parameter blocking manually fixed: speeds will not be measured.")
            elif self.measure_speeds:
                n = None if self.measure_speeds is True else int(self.measure_speeds)
                self.model.measure_and_set_speeds(n=n, discard=0)
        self.set_proposer_blocking()
        self.set_proposer_covmat(load=True)
        self.current_point.add(initial_point, derived=derived, logpost=logpost,
                               logpriors=logpriors, loglikes=loglikes)
        self.log.info("Initial point: %s", self.current_point)
        # Max #(learn+convergence checks) to wait,
        # in case one process dies without sending MPI_ABORT
        self.been_waiting = 0
        self.max_waiting = max(50, self.max_tries.unit_value)
        # Burning-in countdown -- the +1 accounts for the initial point (always accepted)
        self.burn_in_left = self.burn_in.value * self.current_point.output_thin + 1
        # Initial dummy checkpoint
        # (needed when 1st "learn point" not reached in prev. run)
        self.write_checkpoint()

    @property
    def i_last_slow_block(self):
        if self.drag:
            return next(i for i, o in enumerate(self.oversampling_factors) if o != 1) - 1
        self.log.warning("`i_last_slow_block` is only well defined when dragging.")
        return 0

    @property
    def slow_blocks(self):
        return self.blocks[:1 + self.i_last_slow_block]

    @property
    def slow_params(self):
        return list(chain(*self.slow_blocks))

    @property
    def n_slow(self):
        return len(self.slow_params)

    @property
    def fast_blocks(self):
        return self.blocks[self.i_last_slow_block + 1:]

    @property
    def fast_params(self):
        return list(chain(*self.fast_blocks))

    @property
    def n_fast(self):
        return len(self.fast_params)

    @property
    def acceptance_rate(self):
        return self.n() / self.collection[_weight].sum()

    def set_proposer_blocking(self):
        if self.blocking:
            # Includes the case in which we are resuming
            self.blocks, self.oversampling_factors = \
                self.model.check_blocking(self.blocking)
        else:
            self.blocks, self.oversampling_factors = \
                self.model.get_param_blocking_for_sampler(
                    oversample_power=self.oversample_power, split_fast_slow=self.drag)
        # Turn off dragging if one block, or if speed differences < 2x, or no differences
        if self.drag:
            if len(self.blocks) == 1:
                self.drag = False
                self.log.warning(
                    "Dragging disabled: not possible if there is only one block.")
            if max(self.oversampling_factors) / min(self.oversampling_factors) < 2:
                self.drag = False
                self.log.warning(
                    "Dragging disabled: speed ratios < 2.")
        if self.drag:
            # The definition of oversample_power=1 as spending the same amount of time in
            # the slow and fast block would suggest a 1/2 factor here, but this additional
            # factor of 2 w.r.t. oversampling should produce an equivalent exploration
            # efficiency.
            self.drag_interp_steps = int(
                np.round(self.oversampling_factors[self.i_last_slow_block + 1] *
                         self.n_fast / self.n_slow))
            if self.drag_interp_steps < 2:
                self.drag = False
                self.log.warning(
                    "Dragging disabled: "
                    "speed ratio and fast-to-slow ratio not large enough.")
        # Define proposer and other blocking-related quantities
        if self.drag:
            # MARKED FOR DEPRECATION IN v3.0
            if getattr(self, "drag_limits", None) is not None:
                self.log.warning("*DEPRECATION*: 'drag_limits' has been deprecated. "
                                 "Use 'oversample_power' to control the amount of "
                                 "dragging steps.")
            # END OF DEPRECATION BLOCK
            self.get_new_sample = self.get_new_sample_dragging
            self.mpi_info("Dragging with number of interpolating steps:")
            max_width = len(str(self.drag_interp_steps))
            self.mpi_info("* %" + "%d" % max_width + "d : %r", 1, self.slow_blocks)
            self.mpi_info("* %" + "%d" % max_width + "d : %r",
                          self.drag_interp_steps, self.fast_blocks)
        elif np.any(np.array(self.oversampling_factors) > 1):
            self.mpi_info("Oversampling with factors:")
            max_width = len(str(max(self.oversampling_factors)))
            for f, b in zip(self.oversampling_factors, self.blocks):
                self.mpi_info("* %" + "%d" % max_width + "d : %r", f, b)
            if self.oversample_thin:
                self.current_point.output_thin = int(np.round(sum(
                    len(b) * o for b, o in zip(self.blocks, self.oversampling_factors)) /
                                                              self.model.prior.d()))

        # Save blocking in updated info, in case we want to resume
        self._updated_info["blocking"] = list(zip(self.oversampling_factors, self.blocks))
        sampled_params_list = list(self.model.parameterization.sampled_params())
        blocks_indices = [[sampled_params_list.index(p) for p in b] for b in self.blocks]
        self.proposer = BlockedProposer(
            blocks_indices, oversampling_factors=self.oversampling_factors,
            i_last_slow_block=(self.i_last_slow_block if self.drag else None),
            proposal_scale=self.proposal_scale)
        # Cycle length, taking into account oversampling/dragging
        if self.drag:
            self.cycle_length = self.n_slow
        else:
            self.cycle_length = sum(len(b) * o for b, o in
                                    zip(blocks_indices, self.oversampling_factors))
        self.log.debug(
            "Cycle length in steps: %r", self.cycle_length)
        for number in self._quants_d_units:
            number.set_scale(self.cycle_length // self.current_point.output_thin)

    def set_proposer_covmat(self, load=False):
        if load:
            # Build the initial covariance matrix of the proposal, or load from checkpoint
            self._covmat, where_nan = self._load_covmat(
                prefer_load_old=self.output.is_resuming())
            if np.any(where_nan) and self.learn_proposal:
                # We want to start learning the covmat earlier.
                self.mpi_info("Covariance matrix " +
                              ("not present" if np.all(where_nan) else "not complete") +
                              ". We will start learning the covariance of the proposal "
                              "earlier: R-1 = %g (would be %g if all params loaded).",
                              self.learn_proposal_Rminus1_max_early,
                              self.learn_proposal_Rminus1_max)
                self.learn_proposal_Rminus1_max = self.learn_proposal_Rminus1_max_early
            self.log.debug(
                "Sampling with covmat:\n%s",
                DataFrame(self._covmat,
                          columns=self.model.parameterization.sampled_params(),
                          index=self.model.parameterization.sampled_params()).to_string(
                    line_width=_line_width))
        self.proposer.set_covariance(self._covmat)

    def _get_last_nondragging_block(self, blocks, speeds):
        # blocks and speeds are already sorted
        log_differences = np.zeros(len(blocks) - 1)
        for i in range(len(blocks) - 1):
            log_differences[i] = (np.log(np.min(speeds[:i + 1])) -
                                  np.log(np.min(speeds[i + 1:])))
        i_max = np.argmin(log_differences)
        return i_max

    def _run(self):
        """
        Runs the sampler.
        """
        self.mpi_info(
            "Sampling!" +
            (" (NB: no accepted step will be saved until %d burn-in samples " %
             self.burn_in.value + "have been obtained)"
             if self.burn_in.value else ""))
        self.n_steps_raw = 0
        last_output = 0
        last_n = self.n()
        while last_n < self.max_samples and not self.converged:
            self.get_new_sample()
            self.n_steps_raw += 1
            if self.output_every.unit:
                # if output_every in sec, print some info and dump at fixed time intervals
                now = datetime.datetime.now()
                now_sec = now.timestamp()
                if now_sec >= last_output + self.output_every.value:
                    self.do_output(now)
                    last_output = now_sec
            if self.current_point.weight == 1:
                # have added new point
                # Callback function
                n = self.n()
                if n != last_n:
                    # and actually added
                    last_n = n
                    if (hasattr(self, "callback_function_callable") and
                            not (max(n, 1) % self.callback_every.value) and
                            self.current_point.weight == 1):
                        self.callback_function_callable(self)
                        self.last_point_callback = len(self.collection)
                    # Checking convergence and (optionally) learning
                    # the covmat of the proposal
                    if self.check_all_ready():
                        self.check_convergence_and_learn_proposal()
                        if is_main_process():
                            self.i_learn += 1
        if last_n == self.max_samples:
            self.log.info("Reached maximum number of accepted steps allowed. "
                          "Stopping.")
        # Make sure the last batch of samples ( < output_every (not in sec)) are written
        self.collection.out_update()
        if more_than_one_process():
            Ns = (lambda x: np.array(get_mpi_comm().gather(x)))(self.n())
            if not is_main_process():
                Ns = []
        else:
            Ns = [self.n()]
        self.mpi_info("Sampling complete after %d accepted steps.", sum(Ns))

    def n(self, burn_in=False):
        """
        Returns the total number of accepted steps taken, including or excluding burn-in
        steps depending on the value of the `burn_in` keyword.
        """
        return len(self.collection) + (0 if not burn_in
                                       else self.burn_in.value - self.burn_in_left //
                                            self.current_point.output_thin + 1)

    def get_new_sample_metropolis(self):
        """
        Draws a new trial point from the proposal pdf and checks whether it is accepted:
        if it is accepted, it saves the old one into the collection and sets the new one
        as the current state; if it is rejected, it increases the weight of the current
        state by 1.

        Returns:
           ``True`` for an accepted step, ``False`` for a rejected one.
        """
        trial = self.current_point.values.copy()
        self.proposer.get_proposal(trial)
        try:
            logpost_trial, logprior_trial, loglikes_trial, derived = \
                self.model.logposterior(trial)
        except:
            self.send_error_signal()
            raise
        accept = self.metropolis_accept(logpost_trial, self.current_point.logpost)
        self.process_accept_or_reject(accept, trial, derived,
                                      logpost_trial, logprior_trial, loglikes_trial)
        return accept

    def get_new_sample_dragging(self):
        """
        Draws a new trial point in the slow subspace, and gets the corresponding trial
        in the fast subspace by "dragging" the fast parameters.
        Finally, checks the acceptance of the total step using the "dragging" pdf:
        if it is accepted, it saves the old one into the collection and sets the new one
        as the current state; if it is rejected, it increases the weight of the current
        state by 1.

        Returns:
           ``True`` for an accepted step, ``False`` for a rejected one.
        """
        # Prepare starting and ending points *in the SLOW subspace*
        # "start_" and "end_" mean here the extremes in the SLOW subspace
        start_slow_point = self.current_point.values.copy()
        start_slow_logpost = self.current_point.logpost
        end_slow_point = start_slow_point.copy()
        self.proposer.get_proposal_slow(end_slow_point)
        self.log.debug("Proposed slow end-point: %r", end_slow_point)
        # Save derived parameters of delta_slow jump, in case I reject all the dragging
        # steps but accept the move in the slow direction only
        end_slow_logpost, end_slow_logprior, end_slow_loglikes, derived = (
            self.model.logposterior(end_slow_point))
        if end_slow_logpost == -np.inf:
            self.current_point.weight += 1
            return False
        # trackers of the dragging
        current_start_point = start_slow_point
        current_end_point = end_slow_point
        current_start_logpost = start_slow_logpost
        current_end_logpost = end_slow_logpost
        current_end_logprior = end_slow_logprior
        current_end_loglikes = end_slow_loglikes
        # accumulators for the "dragging" probabilities to be metropolis-tested
        # at the end of the interpolation
        start_drag_logpost_acc = start_slow_logpost
        end_drag_logpost_acc = end_slow_logpost
        # start dragging
        for i_step in range(1, 1 + self.drag_interp_steps):
            self.log.debug("Dragging step: %d", i_step)
            # take a step in the fast direction in both slow extremes
            delta_fast = np.zeros(len(current_start_point))
            self.proposer.get_proposal_fast(delta_fast)
            self.log.debug("Proposed fast step delta: %r", delta_fast)
            proposal_start_point = current_start_point + delta_fast
            proposal_end_point = current_end_point + delta_fast
            # get the new extremes for the interpolated probability
            # (reject if any of them = -inf; avoid evaluating both if just one fails)
            # Force the computation of the (slow blocks) derived params at the starting
            # point, but discard them, since they contain the starting point's fast ones,
            # not used later -- save the end point's ones.
            proposal_start_logpost = self.model.logposterior(proposal_start_point)[0]
            (proposal_end_logpost, proposal_end_logprior, proposal_end_loglikes,
             derived_proposal_end) = (self.model.logposterior(proposal_end_point)
                                      if proposal_start_logpost > -np.inf
                                      else (-np.inf, None, [], []))
            if proposal_start_logpost > -np.inf and proposal_end_logpost > -np.inf:
                # create the interpolated probability and do a Metropolis test
                frac = i_step / (1 + self.drag_interp_steps)
                proposal_interp_logpost = ((1 - frac) * proposal_start_logpost
                                           + frac * proposal_end_logpost)
                current_interp_logpost = ((1 - frac) * current_start_logpost
                                          + frac * current_end_logpost)
                accept_drag = self.metropolis_accept(proposal_interp_logpost,
                                                     current_interp_logpost)
            else:
                accept_drag = False
            self.log.debug("Dragging step: %s",
                           ("accepted" if accept_drag else "rejected"))
            # If the dragging step was accepted, do the drag
            if accept_drag:
                current_start_point = proposal_start_point
                current_start_logpost = proposal_start_logpost
                current_end_point = proposal_end_point
                current_end_logpost = proposal_end_logpost
                current_end_logprior = proposal_end_logprior
                current_end_loglikes = proposal_end_loglikes
                derived = derived_proposal_end
            # In any case, update the dragging probability for the final metropolis test
            start_drag_logpost_acc += current_start_logpost
            end_drag_logpost_acc += current_end_logpost
        # Test for the TOTAL step
        accept = self.metropolis_accept(end_drag_logpost_acc / self.drag_interp_steps,
                                        start_drag_logpost_acc / self.drag_interp_steps)
        self.process_accept_or_reject(
            accept, current_end_point, derived,
            current_end_logpost, current_end_logprior, current_end_loglikes)
        self.log.debug("TOTAL step: %s", ("accepted" if accept else "rejected"))
        return accept

    def metropolis_accept(self, logp_trial, logp_current):
        """
        Symmetric-proposal Metropolis-Hastings test.

        Returns:
           ``True`` or ``False``.
        """
        if logp_trial == -np.inf:
            return False
        elif logp_trial > logp_current:
            return True
        else:
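            # Accept with probability exp(logp_trial - logp_current): an
            # Exponential(1) draw exceeds d with probability exp(-d), so this is
            # the standard Metropolis test without explicit exponentiation
            # (avoiding overflow/underflow).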
            return np.random.exponential() > (logp_current - logp_trial)

    def process_accept_or_reject(self, accept_state, trial=None, derived=None,
                                 logpost_trial=None, logprior_trial=None,
                                 loglikes_trial=None):
        """Processes the acceptance/rejection of the new point."""
        if accept_state:
            # add the old point to the collection (if not burning or initial point)
            if self.burn_in_left <= 0:
                if self.current_point.add_to_collection(self.collection):
                    self.log.debug("New sample, #%d: \n   %s",
                                   self.n(), self.current_point)
                    # Update chain files, if output_every *not* in sec
                    if not self.output_every.unit:
                        if self.n() % self.output_every.value == 0:
                            self.collection.out_update()
            else:
                self.burn_in_left -= 1
                self.log.debug("Burn-in sample:\n   %s", self.current_point)
                if self.burn_in_left == 0 and self.burn_in:
                    self.log.info("Finished burn-in phase: discarded %d accepted steps.",
                                  self.burn_in.value)
            # set the new point as the current one, with weight one
            self.current_point.add(trial, derived=derived, logpost=logpost_trial,
                                   logpriors=logprior_trial, loglikes=loglikes_trial)
        else:  # not accepted
            self.current_point.weight += 1
            # Failure criterion: chain stuck! (but be more permissive during burn_in)
            max_tries_now = self.max_tries.value * \
                            (1 + (10 - 1) * np.sign(self.burn_in_left))
            if self.current_point.weight > max_tries_now:
                self.collection.out_update()
                self.send_error_signal()
                raise LoggedError(
                    self.log,
                    "The chain has been stuck for %d attempts. Stopping sampling. "
                    "If this has happened often, try improving your "
                    "reference point/distribution. Alternatively (though not advisable) "
                    "make 'max_tries: np.inf' (or 'max_tries: .inf' in yaml).\n"
                    "Current point: %s", max_tries_now, self.current_point)

    # Functions to check convergence and learn the covariance of the proposal distribution

    def check_all_ready(self):
        """
        Checks if the chain(s) is(/are) ready to check convergence and, if requested,
        learn a new covariance matrix for the proposal distribution.
        """
        msg_ready = ("Ready to check convergence" +
                     (" and learn a new proposal covmat"
                      if self.learn_proposal else ""))
        n = len(self.collection)
        # If *just* (weight==1) got ready to check+learn
        if not (n % self.learn_every.value) and n > 0:
            self.log.info("Learn + convergence test @ %d samples accepted.", n)
            if more_than_one_process():
                self.been_waiting += 1
                if self.been_waiting > self.max_waiting:
                    self.send_error_signal()
                    raise LoggedError(
                        self.log, "Waiting for too long for all chains to be ready. "
                                  "Maybe one of them is stuck or died unexpectedly?")
            self.model.dump_timing()
            # If not running with MPI (only one process), we are ready right away
            if not more_than_one_process():
                self.log.debug(msg_ready)
                return True
            # Error check in case any process already sent an error signal
            self.check_error_signal()
            # If MPI, tell the rest that we are ready -- we use a "gather"
            # ("reduce" was problematic), but we are in practice just pinging
            if not hasattr(self, "req"):  # just once!
                self.all_ready = np.empty(get_mpi_size())
                self.req = get_mpi_comm().Iallgather(
                    np.array([1.]), self.all_ready)
                self.log.info(msg_ready + " (waiting for the rest...)")
        # If all processes are ready to learn (= communication finished)
        if self.req.Test() if hasattr(self, "req") else False:
            # Sanity check: actually all processes have finished
            assert np.all(self.all_ready == 1), (
                "This should not happen! Notify the developers. (Got %r)", self.all_ready)
            if more_than_one_process() and is_main_process():
                self.log.info("All chains are r" + msg_ready[1:])
            delattr(self, "req")
            self.been_waiting = 0
            # Another error check, in case the error occurred after sending "ready" signal
            self.check_error_signal()
            # Just in case, a barrier here
            sync_processes()
            return True
        return False

    def check_convergence_and_learn_proposal(self):
        """
        Checks the convergence of the sampling process, and, if requested,
        learns a new covariance matrix for the proposal distribution from the covariance
        of the last samples.
        """
        if more_than_one_process():
            # Compute and gather means, covs and CL intervals of last half of chains
            use_first = int(self.n() / 2)
            mean = self.collection.mean(first=use_first)
            cov = self.collection.cov(first=use_first)
            mcsamples = self.collection._sampled_to_getdist_mcsamples(first=use_first)
            try:
                bound = np.array([[
                    mcsamples.confidence(i, limfrac=self.Rminus1_cl_level / 2.,
                                         upper=which)
                    for i in range(self.model.prior.d())] for which in [False, True]]).T
                success_bounds = True
            except:
                bound = None
                success_bounds = False
            Ns, means, covs, bounds, acceptance_rates = map(
                lambda x: np.array(get_mpi_comm().gather(x)),
                [self.n(), mean, cov, bound, self.acceptance_rate])
        else:
            # Compute and gather means, covs and CL intervals of last m-1 chain fractions
            m = 1 + self.Rminus1_single_split
            cut = int(len(self.collection) / m)
            try:
                Ns = (m - 1) * [cut]
                means = np.array(
                    [self.collection.mean(first=i * cut, last=(i + 1) * cut - 1) for i in
                     range(1, m)])
                covs = np.array(
                    [self.collection.cov(first=i * cut, last=(i + 1) * cut - 1) for i in
                     range(1, m)])
                mcsamples_list = [
                    self.collection._sampled_to_getdist_mcsamples(
                        first=i * cut, last=(i + 1) * cut - 1)
                    for i in range(1, m)]
            except:
                self.log.info("Not enough points in chain to check convergence. "
                              "Waiting for next checkpoint.")
                return
            acceptance_rates = self.acceptance_rate
            try:
                bounds = [np.array(
                    [[mcs.confidence(i, limfrac=self.Rminus1_cl_level / 2., upper=which)
                      for i in range(self.model.prior.d())] for which in [False, True]]).T
                          for mcs in mcsamples_list]
                success_bounds = True
            except:
                bounds = None
                success_bounds = False
        # Compute convergence diagnostics
        if is_main_process():
            self.progress.at[self.i_learn, "N"] = (
                sum(Ns) if more_than_one_process() else self.n())
            self.progress.at[self.i_learn, "timestamp"] = \
                datetime.datetime.now().isoformat()
            acceptance_rate = (
                np.average(acceptance_rates, weights=Ns)
                if more_than_one_process() else acceptance_rates)
            self.log.info(" - Acceptance rate: %.3f" +
                          (" = avg(%r)" % list(acceptance_rates)
                           if more_than_one_process() else ""),
                          acceptance_rate)
            self.progress.at[self.i_learn, "acceptance_rate"] = acceptance_rate
            # "Within" or "W" term -- our "units" for assessing convergence
            # and our prospective new covariance matrix
            mean_of_covs = np.average(covs, weights=Ns, axis=0)
            # "Between" or "B" term
            # We don't weight with the number of samples in the chains here:
            # shorter chains will likely be outliers, and we want to notice them
            cov_of_means = np.atleast_2d(np.cov(means.T))  # , fweights=Ns)
            # For numerical stability, we turn mean_of_covs into correlation matrix:
            #   rho = (diag(Sigma))^(-1/2) * Sigma * (diag(Sigma))^(-1/2)
            # and apply the same transformation to the mean of covs (same eigenvals!)
            # NB: disables warnings from numpy
            prev_err_state = deepcopy(np.geterr())
            np.seterr(divide="ignore")
            diagSinvsqrt = np.diag(np.power(np.diag(cov_of_means), -0.5))
            np.seterr(**prev_err_state)
            corr_of_means = diagSinvsqrt.dot(cov_of_means).dot(diagSinvsqrt)
            norm_mean_of_covs = diagSinvsqrt.dot(mean_of_covs).dot(diagSinvsqrt)
            success = False
            # Cholesky of (normalized) mean of covs and eigvals of Linv*cov_of_means*L
            try:
                L = np.linalg.cholesky(norm_mean_of_covs)
            except np.linalg.LinAlgError:
                self.log.warning(
                    "Negative covariance eigenvectors. "
                    "This may mean that the covariance of the samples does not "
                    "contain enough information at this point. "
                    "Skipping learning a new covmat for now.")
            else:
                Linv = np.linalg.inv(L)
                # Suppress numpy warnings (restored later in this function)
                error_handling = deepcopy(np.geterr())
                np.seterr(all="ignore")
                try:
                    eigvals = np.linalg.eigvalsh(Linv.dot(corr_of_means).dot(Linv.T))
                    success = True
                except np.linalg.LinAlgError:
                    self.log.warning("Could not compute eigenvalues. "
                                     "Skipping learning a new covmat for now.")
                else:
                    Rminus1 = max(np.abs(eigvals))
                    self.progress.at[self.i_learn, "Rminus1"] = Rminus1
                    # For real square matrices, a possible def of the cond number is:
                    condition_number = Rminus1 / min(np.abs(eigvals))
                    self.log.debug(" - Condition number = %g", condition_number)
                    self.log.debug(" - Eigenvalues = %r", eigvals)
                    self.log.info(
                        " - Convergence of means: R-1 = %f after %d accepted steps" % (
                            Rminus1, (sum(Ns) if more_than_one_process() else self.n())) +
                        (" = sum(%r)" % list(Ns) if more_than_one_process() else ""))
                    # Have we converged in means?
                    # (criterion must be fulfilled twice in a row)
                    if max(Rminus1, self.Rminus1_last) < self.Rminus1_stop:
                        # Check the convergence of the bounds of the confidence intervals
                        # Same as R-1, but with the rms deviation from the mean bound
                        # in units of the mean standard deviation of the chains
                        if success_bounds:
                            Rminus1_cl = (np.std(bounds, axis=0).T /
                                          np.sqrt(np.diag(mean_of_covs)))
                            self.log.debug(" - normalized std's of bounds = %r",
                                           Rminus1_cl)
                            Rminus1_cl = np.max(Rminus1_cl)
                            self.progress.at[self.i_learn, "Rminus1_cl"] = Rminus1_cl
                            self.log.info(
                                " - Convergence of bounds: R-1 = %f after %d " % (
                                    Rminus1_cl,
                                    (sum(Ns) if more_than_one_process() else self.n())) +
                                "accepted steps" +
                                (" = sum(%r)" % list(
                                    Ns) if more_than_one_process() else ""))
                            if Rminus1_cl < self.Rminus1_cl_stop:
                                self.converged = True
                                self.log.info("The run has converged!")
                            self._Ns = Ns
                        else:
                            self.log.info("Computation of the bounds was not possible. "
                                          "Waiting until the next converge check.")
                np.seterr(**error_handling)
        else:
            mean_of_covs = np.empty((self.model.prior.d(), self.model.prior.d()))
            success = None
            Rminus1 = None
        # Broadcast and save the convergence status and the last R-1 of means
        success = share_mpi(success)
        if success:
            self.Rminus1_last, self.converged = share_mpi(
                (Rminus1, self.converged) if is_main_process() else None)
            # Do we want to learn a better proposal pdf?
            if self.learn_proposal and not self.converged:
                good_Rminus1 = (self.learn_proposal_Rminus1_max >
                                self.Rminus1_last > self.learn_proposal_Rminus1_min)
                if not good_Rminus1:
                    self.mpi_info("Convergence less than requested for updates: "
                                  "waiting until the next convergence check.")
                    return
                if more_than_one_process():
                    get_mpi_comm().Bcast(mean_of_covs, root=0)
                else:
                    mean_of_covs = covs[0]
                try:
                    self.proposer.set_covariance(mean_of_covs)
                    if is_main_process():
                        self.log.info(" - Updated covariance matrix of proposal pdf.")
                        self.log.debug("%r", mean_of_covs)
                except:
                    if is_main_process():
                        self.log.debug("Updating covariance matrix failed unexpectedly. "
                                       "waiting until next covmat learning attempt.")
        # Save checkpoint info
        self.write_checkpoint()

    def send_error_signal(self):
        """
        Sends an error signal to the other MPI processes.
        """
        for i_rank in range(get_mpi_size()):
            if i_rank != get_mpi_rank():
                get_mpi_comm().isend(True, dest=i_rank, tag=_error_tag)

    def check_error_signal(self):
        """
        Checks if any of the other processes has sent an error signal and, if so, fails.

        NB: This behaviour only shows up when running this sampler inside a Python script,
            not when running with `cobaya run` (in that case, the process raising an error
            will call `MPI_ABORT` and kill the rest).
        """
        for i in range(get_mpi_size()):
            if i != get_mpi_rank():
                from mpi4py import MPI
                status = MPI.Status()
                get_mpi_comm().iprobe(i, status=status)
                if status.tag == _error_tag:
                    raise LoggedError(self.log, "Another process failed! Exiting.")

    def do_output(self, date_time):
        self.collection.out_update()
        msg = "Progress @ %s : " % date_time.strftime("%Y-%m-%d %H:%M:%S")
        msg += "%d steps taken" % self.n_steps_raw
        if self.burn_in_left and self.burn_in:  # NB: burn_in_left = 1 even if no burn_in
            msg += " -- still burning in, %d accepted steps left." % self.burn_in_left
        else:
            msg += ", and %d accepted." % self.n()
        self.log.info(msg)

    def write_checkpoint(self):
        if is_main_process() and self.output:
            checkpoint_filename = self.checkpoint_filename()
            self.dump_covmat(self.proposer.get_covariance())
            checkpoint_info = {kinds.sampler: {self.get_name(): dict([
                ("converged", bool(self.converged)),
                ("Rminus1_last", self.Rminus1_last),
                ("burn_in", (self.burn_in.value  # initial: repeat burn-in if not finished
                             if not self.n() and self.burn_in_left else
                             0)),  # to avoid overweighting last point of prev. run
                ("mpi_size", get_mpi_size())])}}
            yaml_dump_file(checkpoint_filename, checkpoint_info, error_if_exists=False)
            if not self.progress.empty:
                with open(self.progress_filename(), "a",
                          encoding="utf-8") as progress_file:
                    progress_file.write(
                        self.progress.tail(1).to_string(header=False, index=False) + "\n")
            self.log.debug("Dumped checkpoint and progress info, and current covmat.")

    # Finally: returning the computed products ###########################################

    def products(self):
        """
        Auxiliary function to define what should be returned in a scripted call.

        Returns:
           The sample ``Collection`` containing the accepted steps.
        """
        products = {"sample": self.collection}
        if is_main_process():
            products["progress"] = self.progress
        return products

    # Class methods
    @classmethod
    def output_files_regexps(cls, output, info=None, minimal=False):
        regexps = [output.collection_regexp(name=None)]
        if minimal:
            return [(r, None) for r in regexps]
        regexps += [
            re.compile(output.prefix_regexp_str + re.escape(ext.lstrip(".")) + "$")
            for ext in [_checkpoint_extension, _progress_extension, _covmat_extension]]
        return [(r, None) for r in regexps]

    @classmethod
    def get_version(cls):
        return __version__

    @classmethod
    def _get_desc(cls, info=None):
        if info is None:
            drag = None
        else:
            drag = info.get("drag", cls.get_defaults()["drag"])
        drag_string = {
            True: r" using the fast-dragging procedure described in \cite{Neal:2005}",
            False: ""}
        # Unknown case (no info passed)
        drag_string[None] = " [(if drag: True)%s]" % drag_string[True]
        return ("Adaptive, speed-hierarchy-aware MCMC sampler (adapted from CosmoMC) "
                r"\cite{Lewis:2002ah,Lewis:2013hha}" + drag_string[drag] + ".")
def t_d_df(some_df: DataFrame) -> DataFrame:
    assert isinstance(some_df, DataFrame)
    return some_df.tail(1)
Exemple #19
0
#coding:utf-8
from pandas import DataFrame
import pandas as pd
timeAll = DataFrame(pd.date_range('8/3/2016', periods=252, freq='1d'))
print(timeAll.head())

timeAll.columns = ['day']
print(timeAll.tail())

basedata = DataFrame(columns=['day', 'sale', 'price', 'skuid'])
print(basedata)
# Randomly split into training and test sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_trans,
                                                    y,
                                                    random_state=1,
                                                    test_size=0.4,
                                                    train_size=0.6)
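Note that `sklearn.cross_validation`, used above, was removed in scikit-learn 0.20; in current versions the same splitter lives in `sklearn.model_selection` (equivalent import shown below, the rest of the snippet unchanged):

from sklearn.model_selection import train_test_split
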
Exemple #20
0
    def strategy_sma_crossover(self, short_term_sma_dataframe: pd.DataFrame,
                               long_term_sma_dataframe: pd.DataFrame,
                               short_term_moving_average: int,
                               long_term_moving_average: int,
                               sma_moving_average_lookback_time_period: int):

        print('Analyzing Simple Moving Average Strategy...')

        #Find stocks in which the short term moving average has crossed above the long term moving average within the most recent 20 days
        short_term_sma_dataframe = short_term_sma_dataframe.tail(
            sma_moving_average_lookback_time_period)
        long_term_sma_dataframe = long_term_sma_dataframe.tail(
            sma_moving_average_lookback_time_period)

        sma_cross_df1 = pd.merge(short_term_sma_dataframe,
                                 long_term_sma_dataframe,
                                 left_index=True,
                                 right_index=True)

        sma_cross_up_ticker_name_list = []
        sma_cross_down_ticker_name_list = []

        for ticker in self.sp_500_symbols_list:
            sma_cross_df2 = sma_cross_df1.filter(regex=rf"^{ticker}_")

            #Detect moving average cross up event or cross down event
            #https://stackoverflow.com/questions/28345261/python-and-pandas-moving-average-crossover
            #Shift the column values down one row: the first row becomes NaN and the last value drops off the end.
            previous_short_term_moving_average_df1 = sma_cross_df2[
                f'{ticker}_{short_term_moving_average}_sma'].shift(1)

            previous_long_term_moving_average_df1 = sma_cross_df2[
                f'{ticker}_{long_term_moving_average}_sma'].shift(1)

            short_term_sma_cross_up_df1 = (
                (sma_cross_df2[f'{ticker}_{short_term_moving_average}_sma'] >
                 sma_cross_df2[f'{ticker}_{long_term_moving_average}_sma']) &
                (previous_short_term_moving_average_df1 <=
                 previous_long_term_moving_average_df1))

            #Notice the greater than and lesser than symbols are reversed for the cross down dataframe
            short_term_sma_cross_down_df1 = (
                (sma_cross_df2[f'{ticker}_{short_term_moving_average}_sma'] <
                 sma_cross_df2[f'{ticker}_{long_term_moving_average}_sma']) &
                (previous_short_term_moving_average_df1 >=
                 previous_long_term_moving_average_df1))

            #Count the number of cross up events by counting the number of True values in the boolean array
            number_of_cross_up_events = np.count_nonzero(
                short_term_sma_cross_up_df1)
            number_of_cross_down_events = np.count_nonzero(
                short_term_sma_cross_down_df1)

            #If there is 1 or more True values (i.e., cross up events), then append the name of the ticker to sma_cross_up_ticker_name_list
            if number_of_cross_up_events >= 1:
                sma_cross_up_ticker_name_list.append(ticker)
            #Else if (elif) there is 1 or more True values (i.e., cross down events), then append the name of the ticker to sma_cross_down_ticker_name_list
            elif number_of_cross_down_events >= 1:
                sma_cross_down_ticker_name_list.append(ticker)
            else:
                pass

        print('Analyzing Simple Moving Average Strategy...DONE')

        return sma_cross_up_ticker_name_list, sma_cross_down_ticker_name_list
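
The shift-based crossover test used in `strategy_sma_crossover` can be illustrated in isolation; a minimal sketch on made-up moving-average series (values and names are invented):

import pandas as pd

short_sma = pd.Series([9.0, 9.5, 10.1, 10.4, 10.2])
long_sma = pd.Series([10.0, 10.0, 10.0, 10.0, 10.0])

# A cross-up happens where the short SMA is above the long SMA now
# but was at or below it on the previous row (shift(1)).
cross_up = (short_sma > long_sma) & (short_sma.shift(1) <= long_sma.shift(1))
cross_down = (short_sma < long_sma) & (short_sma.shift(1) >= long_sma.shift(1))

print(int(cross_up.sum()), int(cross_down.sum()))  # 1 0 -> one cross-up, at index 2
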
Exemple #21
0
               index=list('abcd'),
               columns=['one', 'two'])
print(df)
print(df.sum())
print(df.sum(axis=1))  # axis=1 computes across each row, axis=0 down each column
print(df.mean(axis=1, skipna=False))
print(df.describe())

obj = Series(['a', 'a', 'b', 'c'] * 4)

print(obj)
print(obj.describe())

all_data = {}

for ticker in ['AAP', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})

returns = price.pct_change()
print(price.tail())

data = DataFrame({
    'Qu1': [1, 3, 4, 3, 4],
    'Qu2': [2, 3, 1, 2, 3],
    'Qu3': [1, 5, 2, 4, 4]
})
print(data)
Exemple #22
0
iris_target = iris.target

# Reshape to a column vector
iris_target.shape = (150, 1)
iris_all = np.hstack((iris_data, iris_target))
# Convert to DataFrame
iris_data_df = DataFrame(iris_data, columns=feature_names)
iris_target_df = DataFrame(iris_target, columns=['target'])
iris_data_all_df = DataFrame(iris_all, columns=feature_names + ['target'])
'''
Getting basic information about the dataset (using iris_data_df as an example)
'''

# Data preview
print(iris_data_all_df.head())  # first 5 rows by default
print(iris_data_all_df.tail())  # last 5 rows by default
print(iris_data_all_df.sample(5))  # 5 randomly sampled rows

# Data description
'''
This is an already-processed dataset, so the data format is fairly complete and no further processing is needed.
If the data were garbled or contained missing values, we would clean it following the approach in the previous post.
'''

# print(iris_data_all_df.isnull().sum())  # missing values
print(iris_data_all_df.shape)  # dimensions
print(iris_data_all_df.dtypes)  # column types
print(iris_data_all_df.describe())  # common summary statistics
print(iris_data_all_df.info())  # assorted information
'''
Visualization methods, to get an intuitive feel for the data
prop_c_male('Leslie').plot()

# <codecell>

# I couldn't figure out a way of iterating over the names rather than names/sex combo in
# a vectorized way.  

from itertools import islice

names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]),None))

m = [(name_, ambi_names_pt[name_]['M']/(ambi_names_pt[name_]['F'] + ambi_names_pt[name_]['M']))  \
     for name_ in names_to_calc]
p_m_instant = DataFrame(dict(m))
p_m_instant.tail()

# <codecell>

# similar calculation except instead of looking at the proportions for a given year only,
# we look at the cumulative number of male/female babies for given name

from itertools import islice

names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]),None))

m = [(name_, ambi_names_pt[name_]['M'].cumsum()/(ambi_names_pt[name_]['F'].cumsum() + ambi_names_pt[name_]['M'].cumsum()))  \
     for name_ in names_to_calc]
p_m_cum = DataFrame(dict(m))
p_m_cum.tail()
Exemple #24
0
 def label(self, data: pd.DataFrame):
     # data.ta.sma(length=20, append=True)
     # data.ta.macd(append=True)
     # data.ta.percent_return(cumulative=True, append=True)
     data.ta.strategy(self.tastrat, append=True)
     print(data.tail(50))
Exemple #25
0
def GetFollowsByCode_InFiles(filelist, code='SH600036'):
    global codemarket
    # print filelist
    code = CodeName_process(code)
    print 'code:', code
    name, follows_list = GetFollows_InFiles(filelist, code)
    print name.decode('gbk')
    csvfilename = get_stock_history_csv(code, name.decode('gbk'))
    print csvfilename
    if csvfilename == '':
        print 'csv file not found. exit.'
        return
    # print 'follows_list:', follows_list
    follows_chg_list = GetFollows_ProcessList(follows_list, csvfilename)
    xdata = zip(*follows_chg_list)[0]  #get DataFrame from List
    df = DataFrame(follows_chg_list,
                   index=xdata,
                   columns=['DATE', 'CHG', 'CHG_PCT', 'PRICE', 'VOLUME'])
    # print df
    print df.tail(20)
    # print len(df)
    # print df.CHG.describe()
    CHG_mean = df.CHG.mean()
    print 'CHG_mean', CHG_mean
    # print [CHG_mean for x in range(10)]
    # return  #####
    # fig = plt.figure(figsize=(16,9))
    # fig, (ax0, ax1) = plt.subplots(nrows=2, figsize=(16,9))
    fig = plt.figure(figsize=(16, 8.5))
    ax0 = fig.add_axes((0.1, 0.2, 0.8, 0.7))  #[left, bottom, width, height]

    # ax_left = ax0
    ax_left = df.CHG.plot(ax=ax0,
                          kind='bar',
                          alpha=0.5,
                          align='center',
                          linewidth=2)
    ax0.plot([CHG_mean for x in range(len(df))], 'g--', linewidth=2)
    ax_left.set_ylabel('f')
    ax_right = df.PRICE.plot(ax=ax0,
                             secondary_y=True,
                             color='red',
                             marker='v',
                             linewidth=2,
                             alpha=0.7)
    ax_right.set_ylabel('price')

    if codemarket == 0:
        value_str = GetStockInfo_fromFile(
            csv.reader(file('stockinfo_cn.csv', 'rb')), code).decode('gbk')
        plt.title(name.decode('gbk') + code + ' v' + value_str)
    else:
        plt.title(name.decode('gbk') + code)
    plt.xlabel('Date')
    # print type(plt.xlim())
    # print type(xdata), xdata, xdata[0]
    list, listlabel = GetXticksList(xdata)
    ax_left.set_xticks(list)
    ax_left.set_xticklabels([])  #(listlabel, fontsize='small')
    # plt.legend()
    # fig.autofmt_xdate()
    # ax1.set_title('volume')
    # plt.subplot(223, axisbg='r')
    ax1 = fig.add_axes((0.1, 0.05, 0.8, 0.15), sharex=ax0)

    ax_volume = df.VOLUME.plot(ax=ax1,
                               kind='bar',
                               color='green',
                               linewidth=1,
                               alpha=0.7)
    ax_volume.set_xticklabels([])
    ax_volume.set_xticklabels(listlabel, fontsize='small')
    ax_volume.set_xticks(list)
    ax_volume.set_ylabel('volume')
    ax1.plot([df.VOLUME.mean() for x in range(len(df))], 'g--', linewidth=2)

    # fig.subplots_adjust(bottom=0.8)
    # cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    # fig.colorbar(im, cax=cbar_ax)

    if not savepng:
        plt.show()
    else:
        fig.savefig(save_fname)  #, dpi=140)
def GetFollowsByCode_InFiles(filelist, code = 'SH600036'):
    global codemarket
    global titleprefix
    # print filelist
    code = CodeName_process(code)
    print 'code:', code
    name, follows_list = GetFollows_InFiles(filelist, code)   
    #print follows_list[-5:-1]
    print name #.decode('gbk')
    pricehistory = get_stock_history_csv(code, name)
    if pricehistory == []:
        print 'Get pricehistory failed. Exit.'
        return
    # print 'follows_list:', follows_list
    follows_chg_list = GetFollows_ProcessList(follows_list, pricehistory) 
    xdata = zip(*follows_chg_list)[0]   #get DataFrame from List
    df = DataFrame(follows_chg_list, index=xdata, columns=['DATE', 'CHG', 'CHG_PCT', 'PRICE', 'VOLUME'])
    # print df
    print df.tail(20)
    # print len(df)
    # print df.CHG.describe()
    CHG_mean = df.CHG.mean()
    print 'CHG_mean', CHG_mean
    # print [CHG_mean for x in range(10)]
    # return  #####
    # fig = plt.figure(figsize=(16,9))
    # fig, (ax0, ax1) = plt.subplots(nrows=2, figsize=(16,9))
    fig = plt.figure(figsize=(16,8.5))
    ax0 = fig.add_axes((0.1, 0.2, 0.8, 0.7))     #[left, bottom, width, height]
    
    # ax_left = ax0
    ax_left = df.CHG.plot(ax=ax0, kind='bar', alpha=0.5, align='center', linewidth=2)
    ax0.plot([CHG_mean for x in range(len(df))], 'g--', linewidth=2)
    ax_left.set_ylabel('f')
    ax_right = df.PRICE.plot(ax=ax0, secondary_y=True, color='red', marker='v', linewidth=2, alpha=0.7)
    ax_right.set_ylabel('price')
    
    value_str = str(get_stockinfo_volume(code)[0])+u'亿'
    follow_str = str(df.CHG[-1])+'/'+ str(int(CHG_mean))
    plt.title(df.DATE[-1]+' '+titleprefix+' '+name+code+' v'+value_str+' F'+follow_str)
    plt.xlabel('Date')
    # print type(plt.xlim())
    # print type(xdata), xdata, xdata[0]
    list, listlabel = GetXticksList(xdata)
    ax_left.set_xticks(list)
    ax_left.set_xticklabels([]) #(listlabel, fontsize='small')
    # plt.legend()
    # fig.autofmt_xdate()
    # ax1.set_title('volume')
    # plt.subplot(223, axisbg='r')
    ax1 = fig.add_axes((0.1, 0.05, 0.8, 0.15), sharex=ax0)
    
    ax_volume = df.VOLUME.plot(ax=ax1, kind='bar', color='green', linewidth=1, alpha=0.7)
    ax_volume.set_xticklabels([])
    ax_volume.set_xticklabels(listlabel, fontsize='small')
    ax_volume.set_xticks(list)
    ax_volume.set_ylabel('volume')
    ax1.plot([df.VOLUME.mean() for x in range(len(df))], 'g--', linewidth=2)
    
    # fig.subplots_adjust(bottom=0.8)
    # cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    # fig.colorbar(im, cax=cbar_ax)
    
    if not savepng:
        plt.show()
    else:
        fig.savefig(save_fname) #, dpi=140)
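
The dual-axis layout used in both versions of `GetFollowsByCode_InFiles` (a bar series on the left axis, price on a secondary right axis) can be reproduced in a few lines of modern pandas; a sketch on invented data:

import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({'CHG': [3, 5, 2, 6], 'PRICE': [10.0, 10.4, 10.1, 10.8]})
ax = df['CHG'].plot(kind='bar', alpha=0.5)                           # left axis: follower change
df['PRICE'].plot(ax=ax, secondary_y=True, color='red', marker='v')  # right axis: price
ax.set_ylabel('f')
plt.show()
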
Exemple #27
0
    df_train_label = pd.read_csv(
        join(dir_path, 'labels_train.csv'), header=0, index_col=0)

    df_train_label = df_train_label.loc[:, df_train_label.columns.intersection([
        ticker_name])]
    df_train_label.columns = ['Y']

    df_complete = pd.merge(merged_df, df_train_label[['Y']], on='Date')
    df_complete = df_complete.sort_values('Date')
    
    super_merged_df = super_merged_df.append(df_complete, ignore_index=True)
    
count = super_merged_df.shape[0]
print(count)

print(super_merged_df.tail())

# 5 Splitting Data in training and testing
x = super_merged_df[['Entwicklungsrate Preis t+10',
                    'Entwicklungsrate Preis t+20',
                    'Entwicklungsrate Preis t+30',
                    'Entwicklungsrate Preis t+40',
                    'Entwicklungsrate Preis t+50',
                    'Entwicklungsrate Preis t+60',
                    'Entwicklungsrate Preis t+70',
                    'Entwicklungsrate Preis t+80',
                    'Entwicklungsrate Preis t+90',
                    'Entwicklungsrate Volume t+10',
                    'Entwicklungsrate Volume t+20',
                    'Entwicklungsrate Volume t+30',
                    'Entwicklungsrate Volume t+40',
Exemple #28
0
    for i in ch:
        # print(i.tag)    # {current}local .....
        local_name = i.text  # we need the element's value, not an attribute, so use .text
        # print(local_name)
        ta = i.get('ta')
        desc = i.get('desc')
        datas += [[local_name, ta, desc]]  # collected as a list so it can be loaded into a DataFrame

print(datas)

from pandas import DataFrame

df = DataFrame(datas, columns=['지역', '온도', '기상상태'])
print(df.head())  # print only the first few rows
print(df['지역'])  # print only the 지역 (region) column for every row
print(df.tail(3))  # read only the last 3 rows

# =================== A simpler way to do the two steps above in a single pass =================================================================
print('\n\n웹자료 읽어 바로 출력===============================================================')
# import urllib.request

webdata2 = urllib.request.urlopen('http://www.kma.go.kr/XML/weather/sfc_web_map.xml')
xmlFile = etree.parse(webdata2)
root = xmlFile.getroot()
ndate = list(root[0].attrib.values())  # read only the attribute values
print(ndate)
print(ndate[0] + '년 ' + ndate[1] + '월 ' + ndate[2] + '일 ' + ndate[3] + '시')

for child in root:
    for subChild in child:
        print(subChild.text + ' : ' + subChild.attrib.get('ta'))

# zscore normalization for petal length
# standard deviation taken from description of the data
def zScoreNorm(num):

    return ((num - m) / 1.76)


iris_data_c = iris_data
iris_data_c['Petal Length'] = iris_data_c['Petal Length'].apply(zScoreNorm)
norm_zscore_data = iris_data_c['Petal Length']
print "norm_data"
print norm_zscore_data

print iris_data
print iris_target

iris_target['Species'] = iris_target['Species'].apply(flower_type)
print iris_target.head()
print iris_target.tail()

iris = pd.concat([iris_data, iris_target], axis=1)
print iris

sns.pairplot(iris, hue='Species', size=2)
sns.plt.show()

sns.factorplot('Petal Length', data=iris, hue='Species', size=8, kind='count')
sns.plt.show()
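
The snippet above hard-codes the mean `m` and the standard deviation 1.76 quoted from the dataset description; an equivalent sketch that computes both from the data itself (assuming the same `iris_data` frame):

petal = iris_data['Petal Length']
iris_data['Petal Length'] = (petal - petal.mean()) / petal.std()
print(iris_data['Petal Length'].describe())
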


Exemple #31
0
# Reference data by column name or method of dataframe variable
print(aonao['NAO'])
print(aonao.NAO)

# Add column to dataframe
aonao['Diff'] = aonao['AO'] - aonao['NAO']

# Show first several lines of new dataframe
print(aonao.head())

# Remove column from dataframe
del aonao['Diff']

# Show last few lines of dataframe
print(aonao.tail())

# Show slice from dataframe
print(aonao['1981-01':'1981-03'])

# Complex indexing example
import datetime

aonao.loc[(aonao.AO > 0) & (aonao.NAO < 0) &
          (aonao.index > datetime.datetime(1980, 1, 1)) &
          (aonao.index < datetime.datetime(1989, 1, 1)),
          'NAO'].plot(kind='barh')

#
#
# NEXT SECTION OF TUTORIAL (STATISTICS)
obj = Series([1,2,3])

obj.reindex()

data = DataFrame([[1,2,3],[4,5,6]])
data.drop()

np.argsort()

obj.rank()

obj.sort_values()


data.tail()

data.cov()

data.cov()

data.corr()

data.dropna()

data.loc


data.fillna()

data.unstack()
Exemple #33
0
    '薪资': Series(data=salary_list,index=index_list),
    '婚姻状况': Series(data=marital_list,index=index_list)
    }
df=DataFrame(dic)

# Method 1: iterate over the salary column
for value in df['薪资']:
    print(value)

# Method 2: iterate over the salary column
for index,row_data in df.iterrows():
    print(row_data['薪资'])

# Method 3: iterate over the salary column
for col,col_data in df.iteritems():
    if col == '薪资':
        print(col_data)

# Get the maximum salary
for col,col_data in df.iteritems():
    if col == '薪资':
        # strip the trailing 'k' from the salary and convert to float
        list1 = [float(value[:len(value)-1]) for value in col_data]
        # sort in descending order
        max_salary = sorted(list1,reverse=True)[0]
        print(max_salary)
print('###################')
df
df.tail(2)
df.head(2)
df.values
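
For the maximum-salary loop above, the same result can be had with vectorized string methods; a sketch assuming the salary values look like '15k', as in the snippet:

max_salary = df['薪资'].str.rstrip('k').astype(float).max()
print(max_salary)
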
Exemple #34
0
def tail(df: DataFrame, *args, **kwargs) -> DataFrame:
    """
    Convenience function for R users 
    """    

    return df.tail( *args, **kwargs)
Exemple #35
0
#access indexes and columns
print(df.columns)
print(df['Industry'])

#multiple columns
print(DataFrame(df, columns=['Rank', 'Industry', 'Name']))

#NaN values
df2 = DataFrame(df, columns=['Rank', 'Industry', 'Name', 'Profit'])
print("New dataFrame=")
print(df2)

#head and tail
print(df2.head(4))  #prints first 4 rows
print(df2.tail(4))  #prints last 4 rows

#access rows in dataframe
#print(df.ix[0]) #does not work
print(df.iloc[0])  #first row
print(df.loc[5])  #row with index label 5

#assign values to dataframe using numpy
a1 = np.array([1, 2, 3, 4, 5, 6, 7, 8])
df2['Profit'] = a1
print(df2)

#using series
profit = Series([900, 100], index=[3, 5])
df2['Profit'] = profit
print(df2)
Exemple #36
0
    def _undifference_timeseries(self, historical_data: pd.DataFrame,
                                 forecasted_data: list):

        return np.cumsum(
            historical_data.tail(1).value.to_list() +
            forecasted_data).tolist()[1:]
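
`_undifference_timeseries` inverts a first difference by seeding a cumulative sum with the last observed value; a toy sketch of the same idea, showing that `cumsum` undoes `diff`:

import pandas as pd

s = pd.Series([10.0, 12.0, 11.0, 15.0])
ds = s.diff().dropna()                          # differenced series; the first value is lost
restored = pd.concat([s.head(1), ds]).cumsum()  # seed with the first value, then cumulate
print(restored.tolist())                        # [10.0, 12.0, 11.0, 15.0]
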
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

df = DataFrame(np.random.randn(1000, 5))

print(df)

#basic observations
print(df.head())

print(df.tail())

print(df.describe())

column = df[0]

print(column.head())

print(column[np.abs(column) > 3])

#any -> at least one element in the row
print(df[(np.abs(df) > 3).any(1)])

df[(np.abs(df) > 3)] = np.sign(df) * 5

print(df.describe())
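
The in-place capping above (`df[np.abs(df) > 3] = np.sign(df) * 5`) can also be written without mutating the frame, using `DataFrame.mask`; a sketch reusing the same `df`:

capped = df.mask(np.abs(df) > 3, np.sign(df) * 5)
print(capped.describe())
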
Exemple #38
0
                   ('A','b'): [0,1,2,3,-1],
                   ('B','a'): [-20,-10,0,10,20],
                   ('B','b'): [-200,-100,0,100,200]})
p
mask = p.loc[:]<0
p[mask] = 1000
p


# In[488]:


# Set the seed
np.random.seed(121)
dframe = DataFrame(np.random.randn(1000, 5))
dframe.tail(5)
dframe.describe()


# In[489]:


col_0 = dframe[0]
col_0.tail()

# Show me the values of this Series that is > 3
col_0[np.abs(col_0) > 3]


# In[490]:
print(iris.DESCR)

iris_data = DataFrame(X, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
iris_target = DataFrame(Y, columns=['Species'])

def flower(num):
    if num == 0:
        return 'Setosa'
    elif num == 1:
        return 'Vericolour'
    else:
        return 'Virginica'

iris_target['Species'] = iris_target['Species'].apply(flower)
print(iris_target.tail())

iris = pd.concat([iris_data, iris_target], axis=1)

sns.pairplot(iris, hue='Species', size=2)
plt.show()

sns.factorplot('Petal Length', data=iris, hue='Species', size=10)
plt.show()

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

logreg = LogisticRegression()

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.4, random_state=3)
from pandas import Series, DataFrame

__author__ = 'wangjj'
__mtime__ = '20161010下午 11:04'
data = {'Scala': 2003, 'Java': 1995, 'Python': 1991, 'Go': 2009}
ser = Series(data)
print ser
print 'C' in ser
print 'Go' in ser
print ser.values.mean()
print '----'
datas = {
    'name': [
        'Wangdachui',
        'Linling',
        'Niuyun'],
    'pay': [
        4000,
        5000,
        6000]}
dataFra=DataFrame(datas)
print DataFrame(datas)
print '----'
print dataFra['name']
print '----'
print dataFra.pay
print '----'
print dataFra.head(2)
print '----'
print dataFra.tail(2)
Exemple #41
0
dframe

years = [1990, 1991, 1992, 2008, 2012, 2015, 1987, 1969, 2013, 2008, 1999]
# Group these into 10-year bins (decades).
decade_bins = [1960, 1970, 1980, 1990, 2000, 2010, 2020]

decade_cat = pd.cut(years, decade_bins)
decade_cat.shape

decade_cat.categories
pd.value_counts(decade_cat)

np.random.seed(12345)
dframe = DataFrame(np.random.randn(1000, 4))
dframe.head()
dframe.tail()

dframe.describe()
col = dframe[0]
col.head()
col[np.abs(col) > 3]
np.abs(-3.33)
dframe[(np.abs(dframe) > 3).any(1)]
np.sign(dframe)

dframe = DataFrame(np.arange(4 * 4).reshape((4, 4)))
blender = np.random.permutation(4)
blender

dframe
dframe.take(blender)
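# Shuffling with an explicit permutation and take(), as above, can also be done
# with DataFrame.sample (a sketch; frac=1 returns all rows in random order)
dframe.sample(frac=1)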
# In[195]:

KM=[kmeans(X,k) for k in K]
print type(KM),len(KM)


# In[196]:

KM_df=DataFrame(KM)
print KM_df.head(1)


# In[197]:

print KM_df.tail(1)


# In[198]:

KM_df.shape


# In[199]:

KM_v1=KM_df[0]
print type(KM_v1)


# In[200]:
    ctxs = [
        dict(X=X, y=y, linear_regression=fct.fit, x_name=n)
        for n in range(10, n, h)
    ]
    return ctxs


## Get time execution

# sklearn ElasticNet

ctxs = get_vectors_elastic(100000)
df = DataFrame(
    list(measure_time_dim('linear_regression(X, y)', ctxs, verbose=1)))
df['fct'] = 'ElasticNet'
print(df.tail(n=3))
dfs = [df]

# naive & c++ implemantation

for fct in [nv_regular_linreg, cpp_regular_linreg]:
    ctxs = get_vectors(fct, 100000)

    df = DataFrame(
        list(
            measure_time_dim(
                'linear_regression(X, y, beta, alpha, L1_ratio, max_iter, tol, num_samples, num_features)',
                ctxs,
                verbose=1)))
    df['fct'] = fct.__name__
    dfs.append(df)
df.to_csv('births1880.txt',index=False,header=False)

Location = r'births1880.txt'

df = read_csv(Location)

print df

print df.head()

df = read_csv(Location,header=None)

print df

print df.tail()

df = read_csv(Location, names = ['Names','Births'])

print df.head()

import os

os.remove(Location)

print df['Names'].unique()

for x in df['Names'].unique():
    print x

print df['Names'].describe()
Exemple #45
0
time_series = {}
for code, d in zip(codes,data):
    d.index = d.DATE
    time_series[code] = d.VALUE
merged_data = DataFrame(time_series)
# Unequal length series
print(merged_data)

term_premium = merged_data['GS10'] - merged_data['GS1']
term_premium.name = 'Term'
merged_data = merged_data.join(term_premium,how='outer')
default_premium = merged_data['BAA'] - merged_data['AAA']
default_premium.name = 'Default'
merged_data = merged_data.join(default_premium,how='outer')
merged_data = merged_data.drop(['AAA','BAA','GS10','GS1'],axis=1)
print(merged_data.tail())

quarterly = merged_data.dropna()
print(quarterly.tail())

growth_rates_selector = ['GDPC1','INDPRO','CPILFESL']
growth_rates = quarterly[growth_rates_selector].pct_change()
final = quarterly.drop(growth_rates_selector, axis=1).join(growth_rates)

new_names = {'GDPC1':'GDP_growth','INDPRO':'IP_growth','CPILFESL':'Inflation','UNRATE':'Unemp_rate'}
final = final.rename(columns = new_names ).dropna()
final.to_hdf('FRED_data.h5','FRED',complevel=6,complib='zlib')
final.to_excel('FRED_data.xlsx')

ax = final[['GDP_growth','IP_growth','Unemp_rate']].plot(subplots=True)
fig = ax[0].get_figure()
Exemple #46
0
nao = np.loadtxt('norm.nao.monthly.b5001.current.ascii')
dates_nao = pd.date_range('1950-01', periods=nao.shape[0], freq='M')
NAO = Series(nao[:, 2], index=dates_nao)
NAO.index
aonao = DataFrame({'AO': AO, 'NAO': NAO})

aonao.plot(subplots=True)
aonao.head()
aonao['NAO']
aonao.NAO
#Creating a column using the data within dataframe
aonao['Diff'] = aonao['AO'] - aonao['NAO']
aonao.head()
#Removing the column from a dataframe
del aonao['Diff']
aonao.tail()

aonao['1981-01':'1981-03']  #Selecting a specific timeframe

import datetime
aonao.loc[(aonao.AO > 0) & (aonao.NAO < 0)
          & (aonao.index > datetime.datetime(1980, 1, 1))
          & (aonao.index < datetime.datetime(1989, 1, 1)),
          'NAO'].plot(kind='barh')

aonao.mean()

aonao.max()
aonao.min()
aonao.mean(1)
Exemple #47
0
import xlrd  # needed when reading xls files
import numpy as np
import sqlite3

# Create a DataFrame
smp = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nebada', 'Nebada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.6, 1.7, 3.5, 4.3]
       }
frame = DataFrame(smp)

# Access the elements of the DataFrame
frame.year  # frame$year
frame['year']  # frame$year
frame.head()  # head
frame.tail()  # tail
frame2 = DataFrame(
    smp, index=['one', 'two', 'three', 'four', 'five'])  # add an index
frame2.ix['one']
frame2.describe()  # summary
print(frame2.describe())

# Read in the data
data = pd.read_csv('stock_px.csv')
print(data)
xlsx_file = pd.ExcelFile('stock_px.xlsx')  # requires openpyxl to be installed; xls also works
xlsx_file.sheet_names
data = xlsx_file.parse('stock_px')
print(data)

# Read data from the web → http://docs.scipy.org/doc/numpy/reference/generated/numpy.DataSource.html
Exemple #48
0
from pandas import DataFrame
from Data import grade_dic

df = DataFrame(grade_dic, index=['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽'])

# Extract only the last 2 rows of the whole frame
# With no parameter, 5 rows is the default
tail_data = df.tail()
print(tail_data)
print('-' * 30)

tail_data2 = df.tail(2)
print(tail_data2)