Example #1
    def test_combine_series(self):
        s = self.panel['ItemA'][:10]
        result = self.panel.add(s, axis=0)
        expected = DataFrame.add(self.panel, s, axis=0)
        assert_frame_equal(result, expected)

        s = self.panel.ix[5]
        result = self.panel + s
        expected = DataFrame.add(self.panel, s, axis=1)
        assert_frame_equal(result, expected)
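# A minimal sketch (plain DataFrames; names are illustrative) of the same
# axis semantics exercised by the test above:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(6.0).reshape(3, 2),
                  index=['r0', 'r1', 'r2'], columns=['c0', 'c1'])
row_like = pd.Series([10.0, 20.0], index=['c0', 'c1'])
col_like = pd.Series([1.0, 2.0, 3.0], index=['r0', 'r1', 'r2'])

print(df.add(row_like, axis=1))  # aligns on columns, same as df + row_like
print(df.add(col_like, axis=0))  # aligns on the index, broadcast across columns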
Example #2
    def test_fill_value_when_combine_const(self):
        # GH12723
        dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
        df = DataFrame({'foo': dat}, index=range(6))

        exp = df.fillna(0).add(2)
        res = df.add(2, fill_value=0)
        assert_frame_equal(res, exp)
Example #3
    def test_fill_value_when_combine_const(self):
        # GH12723
        dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
        df = DataFrame({'foo': dat}, index=range(6))

        exp = df.fillna(0).add(2)
        res = df.add(2, fill_value=0)
        assert_frame_equal(res, exp)
Example #4
def main():
    begin_date = '20200401'
    end_date = '20210125'
    factors = ['Beta', 'ChipsCV', 'Close', 'CloseToAverage', 'HK', 'Jump', 'MC', 'MCNL', 'MomentumInd', 'RQPM', 'Sigma', 'Skew', 'TurnRate', 'Reversal', 'Value']

    # Load the predicted excess returns for each stock
    IC_hat = pd.read_csv('%s/Results/IC_hat.csv'%gc.IC_PATH, index_col=[0], parse_dates=[0])
    IC_hat = IC_hat.loc[IC_hat.index>begin_date, :]
    IC_hat = IC_hat.loc[IC_hat.index<end_date, :]
    
    y = pd.read_csv('%s/Data/y1.csv'%gc.LABELBASE_PATH, index_col=[0], parse_dates=[0])
    y = y.loc[y.index>begin_date, :]
    y = y.loc[y.index<end_date, :]
    
    y_hat = DataFrame(0, index=y.index, columns=y.columns)
    for factor in factors:
        factor_df = pd.read_csv('%s/Data/%s.csv'%(gc.FACTORBASE_PATH, factor), index_col=[0], parse_dates=[0])
        factor_df = factor_df.loc[factor_df.index>begin_date, :]
        factor_df = factor_df.loc[factor_df.index<end_date, :]
        y_hat = y_hat.add(factor_df.mul(IC_hat.loc[:, factor], axis=0), fill_value=0)
    
    stock_num = 20
    turn_rate = 0.2
    trade_num = int(stock_num * turn_rate)
    
    df_position = DataFrame(index=y.index, columns=list(range(stock_num)))
    df_position.iloc[0, :] = list(y_hat.iloc[0, :].sort_values(ascending=False).iloc[:stock_num].index)

    df_pnl = DataFrame(0, index=y.index, columns=list(range(stock_num)))

    pre_date = df_position.index[0]
    for date in df_position.index[1:]:
        pre_position = list(df_position.loc[pre_date, :])
        position = list(y_hat.loc[date, pre_position].sort_values(ascending=False).dropna().iloc[:(stock_num-trade_num)].index)
        
        stocks = y_hat.loc[date, :].sort_values(ascending=False).index
        for stock in stocks:
            if stock not in position:
                if pd.notna(y.loc[date, stock]):
                    position.append(stock)
                    if len(position) >= stock_num:
                        break
        position.sort()
        df_position.loc[date, :] = position
        df_pnl.loc[date, :] = y.loc[date, position].values
        pre_date = date
    pnl = df_pnl.mean(1)
    df_position.to_csv('%s/Results/df_position.csv'%gc.BACKTEST_PATH)
    df_pnl.to_csv('%s/Results/df_pnl.csv'%gc.BACKTEST_PATH)
    pnl.to_csv('%s/Results/pnl.csv'%gc.BACKTEST_PATH)
    
    plt.figure(figsize=(16,12))
    pnl.cumsum().plot()
    y.mean(1).cumsum().plot()
    
    (pnl - y.mean(1)).cumsum().plot()
    plt.legend(['PNL', 'BENCHMARK', 'ALPHA'])
    plt.savefig('%s/Results/backtest.png'%gc.BACKTEST_PATH)
Example #5
def get_no_rebalancing_port_daily_value_df(weight_series: pd.Series, daily_return_df: pd.DataFrame) -> pd.DataFrame:
    """
    Asset values of a portfolio with no rebalancing.
    The starting values sum to 1.
    """
    assert weight_series.sum() == 1
    initial_value_series = weight_series
    daily_value_df = daily_return_df.add(1).cumprod().multiply(initial_value_series)
    return daily_value_df
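# Usage sketch (hypothetical two-asset data, not from the original project):
import pandas as pd

weights = pd.Series({'stock': 0.6, 'bond': 0.4})      # must sum to 1
returns = pd.DataFrame({'stock': [0.01, -0.02],
                        'bond': [0.001, 0.002]})      # daily returns

values = get_no_rebalancing_port_daily_value_df(weights, returns)
print(values)              # per-asset values drifting away from 0.6/0.4
print(values.sum(axis=1))  # total portfolio value per day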
Example #6
def sens_to_zero_rates(contract, market, curve_ccy, rate_key, reporting_ccy):
    """Sensitivity of each cashflow to the curve specified by currency and key

    A leg that pays IBOR is sensitive to both the discount and tenor curve
     of the currency in which the cash flows (coupons) are paid.
    """
    df_sens = DataFrame(columns=['ttm', 'sens', 'ccy', 'curve'])
    if curve_ccy == contract.currency:

        forwards = ibor_rate(contract, market)
        # replace rate with forwards for any fixing date after valuation date
        a = contract.frame
        a.rate = a.rate.where(a.fixing < market.dt_valuation, forwards)

        zcb_pay = market.discount_factor(a.pay, currency=contract.currency)

        if rate_key == 'discount':
            unpaid = a.pay >= market.dt_valuation
            crv = market.discount_curve(curve_ccy)
            pay_dates = a.pay[unpaid]
            ttm_pay = crv.daycount_fn(market.dt_valuation, pay_dates)
            sens = -ttm_pay * (zcb_pay * a.notional * a.rate * a.period).loc[unpaid]
            if contract.notl_exchange and unpaid.any():
                sens.iloc[-1] += a.notional.iloc[-1]
            if reporting_ccy != contract.currency:
                sens *= market.fx(reporting_ccy, contract.currency)
            df_sens = DataFrame({'ttm': ttm_pay, 'sens': sens,
                                 'ccy': curve_ccy, 'curve': 'discount'})
        elif rate_key == contract.frequency:  # TODO - Review and add comments
            crv, crv_key = market.curve(contract.currency, contract.frequency)
            unfixed = a.fixing >= market.dt_valuation
            pay_dates = a.pay.loc[unfixed]
            ttm_pay = crv.daycount_fn(market.dt_valuation, pay_dates)
            zcbi_pay = crv.discount_factor(pay_dates)

            fix_dates = a.fixing.loc[unfixed]
            ttm_fix = crv.daycount_fn(market.dt_valuation, fix_dates)
            zcbi_fix = crv.discount_factor(contract.frame.fixing)

            scale_factor = zcbi_fix / zcbi_pay * (a.notional * zcb_pay).loc[unfixed]
            sens_pay = ttm_pay * scale_factor
            sens_fix = -ttm_fix * scale_factor

            if reporting_ccy != contract.currency:
                fx = market.fx(reporting_ccy, contract.currency)
                sens_pay *= fx
                sens_fix *= fx

            df_pay = DataFrame({'ttm': ttm_pay, 'sens': sens_pay}).set_index('ttm')
            df_fix = DataFrame({'ttm': ttm_fix, 'sens': sens_fix}).set_index('ttm')
            df_sens = df_pay.add(df_fix, fill_value=0)

            df_sens['ttm'] = df_sens.index
            df_sens['ccy'] = curve_ccy
            df_sens['curve'] = crv_key

    return df_sens
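# The pay/fix merge above hinges on DataFrame.add aligning on the 'ttm' index.
# A stripped-down sketch with made-up numbers:
import pandas as pd

df_pay = pd.DataFrame({'sens': [0.5, 1.0]}, index=pd.Index([1.0, 2.0], name='ttm'))
df_fix = pd.DataFrame({'sens': [-0.4]}, index=pd.Index([1.0], name='ttm'))

# ttm 1.0 appears in both frames and nets to 0.1; ttm 2.0 exists only in
# df_pay and survives unchanged thanks to fill_value=0.
print(df_pay.add(df_fix, fill_value=0))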
Example #7
File: combo.py Project: ageek/ramp
 def combine(self, datas):
     count = DataFrame(np.zeros(len(datas[0])), index=datas[0].index)
     eps = 1.0e-8
     col_names = []
     for data in datas:
         for col in data.columns:
             d = data[col]
             m = d.mean()
             s = d.std()
             if s < eps:
                 continue
             d = d.map(lambda x: self.is_outlier(x, m, s))
             col_names.append(col)
             count = count.add(d, axis=0)
     count.columns = [','.join(col_names)]
     return count
Example #8
 def combine(self, datas):
     count = DataFrame(np.zeros(len(datas[0])), index=datas[0].index)
     eps = 1.0e-8
     col_names = []
     for data in datas:
         for col in data.columns:
             d = data[col]
             m = d.mean()
             s = d.std()
             if s < eps:
                 continue
             d = d.map(lambda x: self.is_outlier(x, m, s))
             col_names.append(col)
             count = count.add(d, axis=0)
     count.columns = [','.join(col_names)]
     return count
Example #9
def add_noise(df: DataFrame, a=-1, b=1, method='add') -> DataFrame:
    """
    Add noise to all rows in a DataFrame
    :param df: The DataFrame we want to add noise to
    :param a: lower limit of random number that we'll use
    :param b: upper limit of random number that we'll use
    :param method: 'add' or 'mul'. How to apply the noise: add a random value to, or multiply one with, every value
    :return: A new DataFrame with random numbers between 'a' and 'b' added to every row
    """
    import numpy as np

    rand_vector = (b - a) * np.random.random_sample(
        (len(df), len(df.columns))) + a
    print(rand_vector)

    if method == 'add':
        return df.add(rand_vector, axis=0)
    if method == 'mul':
        return df.mul(rand_vector, axis=0)

    raise AssertionError("Method must be either 'add' or 'mul'")
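# Usage sketch (hypothetical data):
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]})
noisy = add_noise(df, a=-0.1, b=0.1, method='add')  # jitter every value by at most 0.1
print(noisy)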
Example #10
def sum_boxscores(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Adds two dataframes. It won't sum the `'number'` column.
    :param df1: First dataframe to add.
    :param df2: Second dataframe to add.
    :return: The sum of the dataframes.
    """
    numbers1 = df1.loc[:, "number"]
    numbers2 = df2.loc[:, "number"]
    dorsales = numbers1.combine(numbers2, lambda x, y: x if pd.isna(y) else y)

    minutes1 = df1.loc[:, "minutes"]
    minutes2 = df2.loc[:, "minutes"]
    minutes_sum = minutes1.add(minutes2, fill_value=pd.to_timedelta(0.0))

    df1 = df1.drop("number", axis="columns")
    df2 = df2.drop("number", axis="columns")
    df1 = df1.drop("minutes", axis="columns")
    df2 = df2.drop("minutes", axis="columns")
    df_sum = df1.add(df2, fill_value=0)
    df_sum.loc[:, "number"] = dorsales
    df_sum.loc[:, "minutes"] = minutes_sum
    return df_sum
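# Usage sketch (hypothetical boxscores indexed by player):
import pandas as pd

box1 = pd.DataFrame({'number': [7, 11],
                     'minutes': pd.to_timedelta(['00:20:00', '00:15:00']),
                     'points': [10, 4]}, index=['Alice', 'Bob'])
box2 = pd.DataFrame({'number': [7, 23],
                     'minutes': pd.to_timedelta(['00:18:00', '00:12:00']),
                     'points': [8, 6]}, index=['Alice', 'Carol'])

# points are summed, minutes added as timedeltas, jersey numbers carried over
print(sum_boxscores(box1, box2))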
Example #11
import numpy as np

from pandas import DataFrame
from pandas import Series


data = np.arange(100,80,-1)
s = Series(data)
dic = {
    'line01':np.arange(20,40),
    'line03':np.linspace(30,35,20),
    'line02':np.arange(80,60,-1)
}
f = DataFrame(dic,index=np.arange(100,80,-1))
f2 = DataFrame({'line03':np.linspace(30,35,10),'line04':np.arange(10)},index=np.arange(100,90,-1));
f3 = f.add(f2)
# Sums
# Sum of each column
sum = f.sum()
# Sum of selected columns
sum1 = f[['line01','line02']].sum()
# Sum of each row
sum2 = f.sum(axis=1)
# print sum
# print sum1
# print sum2
# With skipna=False, any NaN in the row/column makes the result NaN; the default skipna=True ignores NaN
sum4 = f3.sum(skipna=False)
# print sum4

# Get all values
Example #12
Basic DataFrame operations
'''

from pandas import DataFrame
import numpy as np

# Create DataFrames
frame1 = DataFrame(np.arange(0,9).reshape(3,3),
                   columns=list('abc'))
frame2 = DataFrame(np.arange(1,10).reshape(3,3),
                   columns=list('abc'))
print(frame1)
print(frame2)

# frame addition
add = frame1.add(frame2)
print(add)

# frame subtraction
sub = frame2.sub(frame1)
print(sub)

# frame division: div = frame2 / frame1
div = frame2.div(frame1)
print(div) # inf: where the denominator is 0

# frame multiplication
mul = frame1.mul(frame2)
print(mul)

# Row/column sums, means, maxima, minima
Example #13
test_person=list(test_person)
test_person.sort()
Test_Dataset=[]
for i in test_person:
    X=Dataset_originall[Dataset_originall['subject#']==i]  # extract all rows for subject i as a DataFrame
    X1=[X.iloc[:,4].values,X.iloc[:,5].values,X.iloc[:,6:].values]  # split the extracted DataFrame into labels y and features X
    Test_Dataset.append(X1)

#
# Next, pool everyone's data together without distinguishing individuals
#
training_person_set=set(training_person)

TRAININGDATA=DataFrame()
for i in training_person:
    TRAININGDATA =TRAININGDATA.add(Dataset_originall[Dataset_originall['subject#']==i],fill_value=0)
X_training=TRAININGDATA.iloc[:,6:].values
y1_training=TRAININGDATA.iloc[:,4].values
y2_training=TRAININGDATA.iloc[:,5].values
                           
                             
TESTDATA=DataFrame()
for i in test_person:
    TESTDATA =TESTDATA.add(Dataset_originall[Dataset_originall['subject#']==i],fill_value=0)
X_test=TESTDATA.iloc[:,6:].values
y1_test=TESTDATA.iloc[:,4].values
y2_test=TESTDATA.iloc[:,5].values

# Feature standardization: ideally done per split, but here the statistics are computed over the whole dataset
def Scaler(X,mean,variance):
    return (X-mean)/variance
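# Note: DataFrame.add(..., fill_value=0) SUMS rows whose index labels collide,
# so the loops above only stack subjects cleanly if every subject occupies
# distinct row labels. For row-wise stacking, pd.concat is the idiomatic tool
# (a sketch, reusing Dataset_originall and training_person from above):
import pandas as pd

TRAININGDATA = pd.concat(
    [Dataset_originall[Dataset_originall['subject#'] == i] for i in training_person],
    axis=0)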
Example #14
def main():
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj

    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2

    # Change NaN
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # not found forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "), columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # Delete column
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # rows
    print data.xs("two", axis=1)
    # icol/irow i is index
    print data.icol(1)
    print data.irow(1)

    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic methods are sub/div/mul

    # Calculation in a DataFrame and Series
    print "", ""
    # subtract from each row (broadcast)
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series

    series2 = Series(range(3), index=list("bef"))
    print frame + series2

    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)

    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)

    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()

    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # order by multi columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # No Duplicates
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]

    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"), columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
Example #15
print(s1+s2) # same result as above
# Operations align on the index
# Non-overlapping index labels come out as NaN

print("\n=======================================================")
print("Dataframe_operation")
print("=======================================================")
df1 = DataFrame(np.arange(9).reshape(3,3), columns=list("abc"))
print(df1)
print("\n")
df2 = DataFrame(np.arange(16).reshape(4,4), columns=list("abcd"))
print(df2)
print("\n")
print(df1+df2)
print("\n")
print(df1.add(df2,fill_value=0))
# A DataFrame must align on both columns and index
# With the add method, fill_value turns the NaN slots into 0
# Operation types: add, sub, div, mul

print("\n=======================================================")
print("Series_DataFrame_operation")
print("=======================================================")
df = DataFrame(np.arange(16).reshape(4,4), columns=list("abcd"))
s = Series(np.arange(10,14), index=list("abcd"))
print(df+s)
# Broadcasting happens along the columns

print("\n=======================================================")
print("map_apply_lambda")
print("=======================================================")
Example #16
class YieldReader (object):
    def __init__ (self, yields = None, scale = 1.0):
        self.masses = [] if yields is None else [ind [0] for ind in yields.index]
        self.isotopes = [] if yields is None else [Isotope (iso) for iso in yields.columns]
        self.isotopes.sort ()
        self.yields = DataFrame () if yields is None else yields
        self.yields = self.yields.fillna (0.0)
        self.yields *= scale

    @classmethod
    def from_file (cls, filename, mass, **kwargs):
        self = cls ()
        self.add_file (filename, mass, **kwargs)
        return self

    @classmethod
    def from_directory (cls, directory = "yields/wh07/", mass_file = "masses", **kwargs):
        self = cls ()
        mass_file = open (directory + "/" + mass_file, "r")
        for line in mass_file:
            if line == "\n":
                continue
            line = line.rstrip ("\n").split (" ")
            try:
                self.add_file (directory + "/" + line [1], float (line [0]) * u.solMass, **kwargs)
            except IndexError:
                self.yields = self.yields.append (DataFrame ([{"mass": float (line [0]) * u.solMass, "file": directory + "/"}]).set_index (["mass", "file"]))
                self.masses.append (float (line [0]) * u.solMass)
        return self

    @classmethod
    def combine (cls, yield_readers):
        self = cls ()
        self.masses = u.Quantity (np.array (np.concatenate ([yr.masses for yr in yield_readers])))
        for yr in yield_readers:
            isotopeArray = yr.isotopes
            for iso in isotopeArray:
                if iso not in self.isotopes:
                    self.isotopes.append (iso)
        self.isotopes.sort ()
        for yr in yield_readers:
            dataframe = yr.yields
            self.yields = self.yields.append (dataframe)
        self.yields = self.yields.fillna (0.0)
        return self

    def add_file (self, filename, mass, winds = True, explosions = True, keplerYield = True, totalYieldName = "yieldall", windYieldName = "yieldwind", expYieldName = None, isotopeName = "isotope", table = 1):
        self.masses.append (mass)
        if keplerYield:
            i = fromKeplerYield (filename, table)
        else:
            i = 0
        result = np.genfromtxt (filename, skip_header = i, names = True, dtype = None)

        yieldDF = {}
        yieldDF ["mass"] = mass
        yieldDF ["file"] = filename
        for row in result:
            if row [isotopeName] == "total" or row [isotopeName] == b"total":
                break
            isotope = Isotope (row [isotopeName])
            if isotope not in self.isotopes:
                self.isotopes.append (isotope)
            yieldDF [isotope.string] = 0.0
            if winds and explosions and totalYieldName is not None:
                yieldDF [isotope.string] += float (row [totalYieldName])
            else:
                if winds:
                    yieldDF [isotope.string] += float (row [windYieldName])
                if explosions:
                    if expYieldName is None:
                        yieldDF [isotope.string] += float (row [totalYieldName]) - float (row [windYieldName])
                    else:
                        yieldDF [isotope.string] += row [expYieldName]
        self.yields = self.yields.append (DataFrame ([yieldDF]).set_index (["mass", "file"]))
        self.yields = self.yields.fillna (0.0)
        self.isotopes.sort ()
  
    def get_yield (self, isotope, massArray = None, tolerance = 0.0001):
        if isinstance (isotope, Isotope):
            isotope = isotope.string
        if isotope not in self.yields:
            if massArray is None:
                return u.Quantity ([0.0] * len (self.yields), u.solMass)
            return u.Quantity ([0.0] * len (massArray), u.solMass)
                
        if massArray is None:
            return u.Quantity (self.yields [isotope], u.solMass)
        return u.Quantity (self.yields [isotope].iloc [massArray], u.solMass)
        
    def get_masses (self):
        return self.masses

    def get_keys (self):
        return [i for i in self.yields.index]
        
    def __add__ (self, other):
        return YieldReader (self.yields.add (other.yields, fill_value = 0.0))
        
    def __mul__ (self, scalar):
        return YieldReader (self.yields, scalar)
        
    __rmul__ = __mul__
    
    def __div__ (self, scalar):
        return self * (1.0 / scalar)

    def __getitem__ (self, i):
        if isinstance (i, slice):
            return YieldReader (self.yields [i])
        return YieldReader (self.yields [i:i+1])
Example #17
def combine_data(futures_info_a: pd.DataFrame, futures_info_b: pd.DataFrame):
    """combine futures close price according to the front to later"""
    if futures_info_b is None or futures_info_b.empty:
        return futures_info_a
    return futures_info_a.add(futures_info_b, fill_value=0)
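# Usage sketch (hypothetical, disjoint date ranges):
import pandas as pd

a = pd.DataFrame({'close': [10.0, 11.0]},
                 index=pd.to_datetime(['2021-01-04', '2021-01-05']))
b = pd.DataFrame({'close': [9.5]}, index=pd.to_datetime(['2021-01-06']))

print(combine_data(a, b))     # fill_value=0 stitches the non-overlapping dates together
print(combine_data(a, None))  # falls back to the first frame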
Example #18
import numpy as np
from pandas import DataFrame

npdata = np.random.randn(5, 3)
columnNames = ['x1', 'x2', 'x3']
data = DataFrame(npdata, columns=columnNames)

print('data =')
print(data)

columnNames = ['x1', 'x2', 'x3']
data2 = DataFrame(np.random.randn(5, 3), columns=columnNames)
print('\ndata2 =')
print(data2)

print('\ndata + data2 = ')
print(data.add(data2))

print('\ndata * data2 = ')
print(data.mul(data2))
Example #19
from pandas import Series, DataFrame
import numpy as np

s1 = Series([5, 6, -1, 2], index=['a', 'c', 'd', 'e'])

s2 = Series([3, 4, -1, 2, 7], index=['a', 'c', 'e', 'f', 'g'])

print(s1+s2)

df1 = DataFrame(np.arange(9).reshape(3, 3), columns=list('bcd'),
                index=['seoul', 'busan', 'kwangju'])

df2 = DataFrame(np.arange(12).reshape(4, 3), columns=list('bde'),
                index=['Incheon', 'seoul', 'busan', 'suwon'])

print(df1)
print(df2)
print(df1+df2)

df3 = DataFrame(np.arange(12).reshape(3, 4), columns=list('abcd'))
df4 = DataFrame(np.arange(20).reshape(4, 5), columns=list('abcde'))

print(df3)
print(df4)

print(df3+df4)

print(df3.add(df4, fill_value=0))

print(df3.reindex(columns=df4.columns, fill_value=0))
Example #20
df1 = DataFrame(np.arange(9).reshape(3, 3),
                columns=list('bcd'),
                index=['seoul', 'busan', 'kwangju'])
df2 = DataFrame(np.arange(12).reshape(4, 3),
                columns=list('bde'),
                index=['incheon', 'seoul', 'busan', 'suwon'])

print(df1 + df2)
# As with Series, only overlapping labels are combined; anything combined with NaN becomes NaN

df3 = DataFrame(np.arange(12).reshape(3, 4), columns=list('abcd'))
df4 = DataFrame(np.arange(20).reshape(4, 5), columns=list('abcde'))
print(df3 + df4)

print(df3.add(df4, fill_value=0))
# The fill_value option fills the NaN slots with 0
# so the result is effectively df4's values added to 0

# Arithmetic between DataFrame and Series
## Similar to NumPy broadcasting
print(df3.reindex(columns=df4.columns, fill_value=0))

arr = np.arange(12, ).reshape(3, 4)
print(arr)
print(arr[0])
print(arr - arr[0])
#0 1 2 3    -    0 1 2 3
#4 5 6 7
#8 9 10 11
Example #21
dframe1 = DataFrame(np.arange(4).reshape((2,2)),
                    columns = list('AB'),
                    index = ['NYC','LA'])
dframe1


dframe2 = DataFrame(np.arange(9).reshape((3,3)),
                    columns = list('ADC'),
                    index = ['NYC','SF','LA'])
dframe2

# adding dataframes
dframe1 + dframe2 # only adds where both row and column match, everything else will be null

dframe1
dframe1.add(dframe2, fill_value = 0) # doesn't add if row/column combination doesn't exist in either table


# operations between Series and DataFrame
ser3 = dframe2.ix[0]
ser3

dframe2 - ser3







Example #22
df2 = DataFrame(np.arange(12).reshape((4, 3)),
                columns = list('bde'),
                index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print df1
print df2
print df1 + df2
print

print 'Filling values'
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns = list('abcde'))
print df1
print
print df2
print
print df1.add(df2, fill_value = 0)
print df1.reindex(columns = df2.columns, fill_value = 0)
print

print 'Operations between DataFrame and Series'
arr = np.arange(12.).reshape((3, 4))
print arr
print arr[0]
print arr - arr[0]
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print frame
print series
print
Example #23
s1 = Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = Series([4, 5, 6, 7], index=['a', 'b', 'd', 'c'])
print(s1)
print(s2)
print(s1 + s2)
print(s1.add(s2))  # combined where index labels match (the indexes must align)

print()
df1 = DataFrame(np.arange(9.).reshape(3, 3),
                columns=list('kbs'),
                index=['서울', '인천', '수원'])
print(df1)
df2 = DataFrame(np.arange(12.).reshape(4, 3),
                columns=list('kbs'),
                index=['서울', '인천', '일산', '수원'])
print(df2)

print()
print(df1 + df2)  # the operator can't take options.
print(df1.add(df2))  # the method can take options.
print(df1.add(df2, fill_value=0))  # e.g. the fill_value option.

print()
print(df1 + df2)
print(df1.mul(df2))  # mul: multiplication
print(df1.mul(df2, fill_value=0))

print()
seri = df1.iloc[0]  # the first row of df1
print(seri)
print(df1 - seri)  # works via broadcasting
Example #24
ser2 = Series([3, 4, 5, 6], index=['A', 'B', 'C', 'D'])
ser2

# adding series
ser1 + ser2

dframe1 = DataFrame(np.arange(4).reshape((2, 2)),
                    columns=list('AB'),
                    index=['NYC', 'LA'])
dframe1

dframe2 = DataFrame(np.arange(9).reshape((3, 3)),
                    columns=list('ADC'),
                    index=['NYC', 'SF', 'LA'])
dframe2

# adding dataframes
dframe1 + dframe2  # only adds where both row and column match, everything else will be null

dframe1
dframe1.add(
    dframe2, fill_value=0
)  # doesn't add if row/column combination doesn't exist in either table

# operations between Series and DataFrame
ser3 = dframe2.ix[0]
ser3

dframe2 - ser3
Example #25
data.ix['Colorado',['two','three']]
data.ix['Colorado',[3,0,1]]
data.ix['Colorado']
data['two']
data.two

## Arithmetic alignment
s1 = Series([7.3,-2.5,3.4,1.5], index=['a','b','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1], index=['a','c','e','f','g'])
s1 + s2
df1 = DataFrame(np.arange(9).reshape((3,3)), columns=list('bcd'), index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12).reshape((4,3)),columns=list('bde'), index=['Utah','Ohio','Texas','Oregon'])
df1 + df2
# "Alignment" means only values with matching indexes are combined
# fill_value supplies a special value for labels missing on one side
df1.add(df2, fill_value=0)
# Labels missing from both frames still come out as NaN.

## Arithmetic between DataFrame and Series
frame = DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'), index=['Utah','Utahs','Texas','Oregon'])
series = frame.ix[0]
frame - series
# Each row subtracts the matching elements; this is broadcasting down the rows.
# To broadcast along the columns instead:
series3 = frame['d']
frame.sub(series3, axis=0)

## Function application and mapping
frame = DataFrame(np.random.randn(4,3), columns=list('bde'),
	index=['Utah','Ohio','Texas','Oregon'])
f = lambda x :x.max()-x.min()
Example #26
frame5[:2]
frame5[frame5['three'] > 5]
frame5 < 5  # returns booleans
frame5[frame5 < 5] = 0
frame5  # frame5 < 5 yields the True/False mask used above to set values below 5 to 0
frame5.loc['Colorado', ['two', 'three']]
frame5.ix[['Ohio', 'Utah'], [3, 0, 1]]  # ix allows more flexible subsetting
frame5.ix[frame5.three > 7, :3]

## Arithmetic and data alignment
# Only overlapping rows/columns of Series/DataFrames are combined; non-overlapping slots become NaN
frame6_1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
frame6_2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

frame6_1 + frame6_2
frame6_1.add(frame6_2, fill_value=0)  # pass frame6_2 together with a fill_value argument
frame6_1.reindex(columns=frame6_2.columns, fill_value=0)  # a fill value can also be given when reindexing

# Arithmetic between DataFrame and Series
yc10 = np.arange(12.).reshape((3, 4))
yc10
yc10 - yc10[1]

frame7 = DataFrame(np.arange(12.).reshape((4, 3)),
                   columns=list('abc'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series_7 = frame7.iloc[0]
frame7 - series_7  # subtracted from every row: broadcasting
series_7_1 = frame7['b']
frame7.sub(series_7_1, axis=0)
Example #27
    # Model performance on the test set
    print("In the test data, the model MSE is %g" % sess.run(loss,feed_dict={x:test_x,y_:test_y}))

    # Export the model
    #     writer = tf.summary.FileWriter('log',sess.graph)
    # Persist the model
    #     saver.save(sess,"model/{}.ckpt".format(KEYWORD))

    # Save the results
    # Switch back to the original date index
    # Needs train_data, test_data, out_data
    out_data.index = pd.date_range(start=raw_data.index[0],freq=FREQ,periods=len(out_data))
    # Merge with the original data and convert to a dict
    df_data1 = DataFrame({'curve':out_data})
    df_data2 = DataFrame({'data':train_data})
    df_data3 = DataFrame({'test':test_data})
    df_data = df_data1.add(df_data2,fill_value=0)
    df_data = df_data.add(df_data3,fill_value=0)
    df_data.plot(figsize=(20, 8.5))
#     j_data = json.loads(df_data.to_json(orient='split',date_unit='s'))
# Add key, freq, option
#     j_data['key'] = KEYWORD
#     j_data['freq']=FREQ
#     j_data['option']='TensorFlow'
#     print(j_data)
# Save to MongoDB
#     predict_col = pymongo.MongoClient('localhost')['scrapy']['predict']
#     predict_col.update_one({"key":j_data['key'],"freq":j_data['freq']},{"$set":j_data},upsert=True)

Example #28
def sens_to_zero_rates(contract, market, curve_ccy, rate_key, reporting_ccy):
    """Sensitivity of each cashflow to the curve specified by currency and key

    A leg that pays IBOR is sensitive to both the discount and tenor curve
     of the currency in which the cash flows (coupons) are paid.
    """
    df_sens = DataFrame(columns=['ttm', 'sens', 'ccy', 'curve'])
    if curve_ccy == contract.currency:

        forwards = ibor_rate(contract, market)
        # replace rate with forwards for any fixing date after valuation date
        a = contract.frame
        a.rate = a.rate.where(a.fixing < market.dt_valuation, forwards)

        zcb_pay = market.discount_factor(a.pay, currency=contract.currency)

        if rate_key == 'discount':
            unpaid = a.pay >= market.dt_valuation
            crv = market.discount_curve(curve_ccy)
            pay_dates = a.pay[unpaid]
            ttm_pay = crv.daycount_fn(market.dt_valuation, pay_dates)
            sens = -ttm_pay * (zcb_pay * a.notional * a.rate *
                               a.period).loc[unpaid]
            if contract.notl_exchange and unpaid.any():
                sens.iloc[-1] += a.notional.iloc[-1]
            if reporting_ccy != contract.currency:
                sens *= market.fx(reporting_ccy, contract.currency)
            df_sens = DataFrame({
                'ttm': ttm_pay,
                'sens': sens,
                'ccy': curve_ccy,
                'curve': 'discount'
            })
        elif rate_key == contract.frequency:  # TODO - Review and add comments
            crv, crv_key = market.curve(contract.currency, contract.frequency)
            unfixed = a.fixing >= market.dt_valuation
            pay_dates = a.pay.loc[unfixed]
            ttm_pay = crv.daycount_fn(market.dt_valuation, pay_dates)
            zcbi_pay = crv.discount_factor(pay_dates)

            fix_dates = a.fixing.loc[unfixed]
            ttm_fix = crv.daycount_fn(market.dt_valuation, fix_dates)
            zcbi_fix = crv.discount_factor(contract.frame.fixing)

            scale_factor = zcbi_fix / zcbi_pay * (a.notional *
                                                  zcb_pay).loc[unfixed]
            sens_pay = ttm_pay * scale_factor
            sens_fix = -ttm_fix * scale_factor

            if reporting_ccy != contract.currency:
                fx = market.fx(reporting_ccy, contract.currency)
                sens_pay *= fx
                sens_fix *= fx

            df_pay = DataFrame({
                'ttm': ttm_pay,
                'sens': sens_pay
            }).set_index('ttm')
            df_fix = DataFrame({
                'ttm': ttm_fix,
                'sens': sens_fix
            }).set_index('ttm')
            df_sens = df_pay.add(df_fix, fill_value=0)

            df_sens['ttm'] = df_sens.index
            df_sens['ccy'] = curve_ccy
            df_sens['curve'] = crv_key

    return df_sens
Example #29
data.drop('three',axis=1)

#Use ix to locate individual elements directly
data.ix['Colorado',['one','four']]


#***************************
#Adding Series aligns on index; where a label is missing on one side the sum comes out as NaN
#list('abcd') is equivalent to ['a','b','c','d']

df1=DataFrame(np.arange(12).reshape(3,4),columns=list('abcd'))

df2=DataFrame(np.arange(20).reshape(4,5),columns=list('abcde'))

df1.add(df2,fill_value=0)  # NaN slots with no value are replaced by 0 before the addition

#add #sub #div #mul: addition, subtraction, division, multiplication
#***********************************************
frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
frame.abs()

#Custom function
f=lambda x:x.max()-x.min()
#run it with apply
frame.apply(f) #applied to each column
frame.apply(f,axis=1) #applied to each row

#Define a function
def f(x):
    return Series([x.min(),x.max()],index=['min','max'])
Example #30
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import nan  # import the needed modules
import matplotlib.pyplot as plt
# Create the data
df1 = DataFrame(np.arange(12).reshape((3, 4)), columns=list("abcd"))
df2 = DataFrame(np.arange(20).reshape((4, 5)), columns=list("abcde"))
df1
df2
df3 = df1 + df2  #df1.add(df2)
df3.iloc[1, 3] = nan
df1.add(df2, fill_value=0)  # df1's missing row 3 and column e are treated as 0 when adding
df1.add(df2).fillna(0)  # add df1 and df2 normally, then replace the resulting NaN with 0
print(df3)
df3.plot()
plt.show()
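# Note that the two fill variants above are NOT equivalent; a tiny sketch of the difference:
import numpy as np
from pandas import DataFrame

x = DataFrame({'a': [1.0, np.nan]})
y = DataFrame({'a': [np.nan, np.nan]})
print(x.add(y, fill_value=0))  # row 0 -> 1.0; row 1 stays NaN (missing on BOTH sides is not filled)
print(x.add(y).fillna(0))      # both rows -> 0.0 (every NaN in the result is filled afterwards)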
Example #31
'''6. DataFrame arithmetic. As with Series, data with different indexes is aligned automatically; where the indexes don't correspond, NaN is filled in.'''
# 6.1 Arithmetic between DataFrames
df1 = DataFrame(
    {
        'Python': [119, 120, 110],
        '数学': [130, 118, 112],
        '英语': [90, 137, 99]
    },
    index=['张三', '王五', '李四'])
df2 = DataFrame(data=np.random.randint(0, 150, size=(4, 4)),
                index=['张三', '王五', '李四', 'Michael'],
                columns=['Python', '数学', '物理', '英语'])

df1 + 10  # 10 is added to every element of the DataFrame!
df1 + df2  # adding DataFrames aligns index and columns automatically; matching cells are summed, everything else becomes NaN!
df1.add(df2, fill_value=0)  # with fill_value, unmatched labels are filled with 0 instead of returning NaN
df2.add(df1, fill_value=0)  # exactly the same result as above
'''Mapping between Python operators and pandas methods:
+           add()
-           sub(), subtract()
*           mul(), multiply()
/           truediv(), div(), divide()
//          floordiv()
%           mod()
**          pow()
mean        mean()
standard deviation  std()'''
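# A quick check (hypothetical df) that each operator matches its pandas method:
import pandas as pd

df = pd.DataFrame({'x': [1, 2], 'y': [3, 4]})
assert (df + df).equals(df.add(df))
assert (df - df).equals(df.sub(df))
assert (df * df).equals(df.mul(df))
assert (df / df).equals(df.div(df))
assert (df // df).equals(df.floordiv(df))
assert (df % df).equals(df.mod(df))
assert (df ** df).equals(df.pow(df))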

# 6.2 Arithmetic between DataFrame and Series!!! Take the time to understand what happens here; it's a bit tricky.
df1 = DataFrame(
    {
Example #32
# Arithmetic and data alignment
s1 = Series([1.1, 2.2, 3.3], index=['a', 'b', 'c'])
s2 = Series([-1.1, -2.2, -3.0, 4.4], index=['a', 'b', 'c', 'd'])
s3 = s1 + s2
print(s3)

d1 = DataFrame(np.arange(9).reshape((3, 3)),
               index=[1, 2, 3],
               columns=list('abc'))
d2 = DataFrame(np.arange(12).reshape((4, 3)),
               index=[1, 2, 3, 4],
               columns=list('cde'))
d3 = d1 + d2
print(d3)
d3 = d1.add(d2, fill_value=0)
print(d3)

# Arithmetic between DataFrame and Series, plus sorting
df1 = DataFrame(np.arange(12).reshape((4, 3)),
                columns=list('abc'),
                index=[1, 2, 3, 4])
s1 = Series(df1.loc[1])
print(df1)
print(s1)
dele = df1 - s1  # broadcast subtraction
print(dele)

s2 = Series(np.arange(3), index=['c', 'd', 'e'])
add1 = df1 + s2  # differing indexes are unioned
print(add1)
Example #33
# -*- coding: utf-8 -*-
"""
            Arithmetic rules between DataFrames and Series
"""

#%%
from pandas import DataFrame,Series
from string import ascii_letters as letters  # string.letters in legacy Python 2
s1=Series(range(3),list(letters[:3]))
d1=DataFrame(
    {'a':range(0,3)},
    index=list(letters[:3])
)
d2=DataFrame(
    {'a':range(0,10),'b':range(10,20),'c':range(20,30)},
    index=list(letters[:10])
)

#%% DataFrame addition aligns indexes and combines same-position elements
d1+d2
#%% DataFrame + Series aligns on the columns and broadcasts down the rows
d2+s1
#%% Use the add method to align on the rows instead
# axis says which labels to align on, not the direction of broadcasting
d2.add(s1,axis=0)  #  {0, 1, 'index', 'columns'} 
#%%
d2.add(s1,axis=1)


Example #34
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn

df1 = pd.read_clipboard() # read the frame just copied to the clipboard
df2 = DataFrame(np.arange(16).reshape(4,4), columns = "col1 col2 col3 col4".split(), index = list('ABCD'))
print(df1,'\n')
print(df1.columns,'\n')     # show the columns
#print(df1.head(),'\n')		# first 5 rows
print(df1[['col1', 'col3']])  # select columns
print(df1[df1['col4'] > 50])	# rows where col4 > 50
print(df1 > 50)					# elementwise comparison
print(df1.ix['A'])			# select a row
print(df1.drop('B'))		# drop a row
print(df1.drop('col3', axis = 1)) # drop a column
print(df2)
print(df2.add(df1, fill_value = 0)) # add the two frames
print(df1.sum())			# sum of each column
print(df1.sum(axis = 1))		# sum of each row
print(df1.max())			# max of each column
print(df1.idxmax())			# index of each column's max
Example #35
# DataFrame alignment
# Works like Series, except both rows and columns are aligned
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', "Colorado"])
df2 = DataFrame(np.arange(12.).reshape((4,3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
# print df1
# print df2
df1_add_df2 = df1 + df2
# print df1_add_df2

# Fill a default value in place of labels that don't appear in one frame
df1_default = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2_default = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
# Without a default value
df1_add_df2_without = df1_default + df2_default
# With a default value, missing slots participate in the calculation as 0
df1_add_df2_default = df1_default.add(df2_default, fill_value=0)
# print df1_add_df2_without
# print df1_add_df2_default

# The corresponding arithmetic methods:
# add addition
# sub subtraction
# div division
# mul multiplication

# Arithmetic between DataFrame and Series
arr = np.arange(12.).reshape((3, 4))
# broadcasting
# print arr - arr[0]
# DataFrame/Series arithmetic behaves like this broadcast
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
Example #36
frame4 = DataFrame(
    {
        'a': np.random.randn(4),
        'b': ['foo', 'lol'] * 2,
        'c': [True, False] * 2
    },
    index=list('abcd'))
frame.ix['f'] = np.random.randn(4)
frame['loc'] = ['ST', 'MO'] * 3
frame.sort_index(axis=1)
frame.sort_values(by=['loc', 'STL'])
frame.rank(axis=0)
frame.rank(method='max')
um.order()
um.rank()
frame.add(frame2)
frame.corr(um)
frame.fillna(1, inplace=True)
um = frame['UM']
frame.corr()
frame.cov()
frame2.ix['f'] = np.random.randn(3)
frame.corrwith(frame2)
frame.corrwith(um)
frame.corrwith(um.to_frame())
frame.ix[:, 'Washu':'UMST'].apply(lambda x: x.mean())
frame.set_index('UM', drop=True, inplace=True)
keys = frame.index
frame.reset_index(level=keys)

df = DataFrame(np.random.randn(6, 5),
Example #37
ser1 = Series([0, 1, 2], index=['A', 'B', 'C'])
ser1
ser2 = Series([3, 4, 5, 6], index=['A', 'B', 'C', 'D'])
ser2
ser1 + ser2
dframe1 = DataFrame(np.arange(4).reshape(2, 2),
                    columns=list('AB'),
                    index=['NYC', 'LA'])
dframe1
dframe2 = DataFrame(np.arange(9).reshape(3, 3),
                    columns=list('ADC'),
                    index=['NYC', 'SF', 'LA'])
dframe2
dframe1 + dframe2
dframe1.add(dframe2)
ser3 = dframe2.ix[0]
ser3
dframe2 - ser3

ser1 = Series(range(3), index=['C', 'A', 'B'])
ser1

ser1.sort_index()
ser1.sort_values()

ser2 = Series(randn(10))
ser2

arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
arr
Example #38
# In[ ]:


df1


# In[ ]:


df2


# In[ ]:


df1.add(df2, fill_value=0)   # slots missing from df1 relative to df2 are treated as 0, then the two are added


# In[ ]:


df1    # df1 itself is unchanged


# In[ ]:


df2


# In[ ]:
Example #39
from pandas import Series, DataFrame
import numpy as np
import pandas as pd

s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print(s1 + s2)

df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
print(df1)
print('')
print(df2)

print('without fill value')
print(df1.add(df2))

print('with fill value')
print(df1.add(df2, fill_value=0))

# set fill value when reindexing
print(df1.reindex(columns=df2.columns, fill_value=0))

Example #40
print obj5["one"]
print obj5[:2]
obj5[obj5 < 5] = 3
print obj5
print obj5.ix["Ohio", ["one", "two"]]

s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])

print s1 + s2

df1 = DataFrame(np.arange(9).reshape((3, 3)), columns=list("bcd"), index=["Ohin", "Texa", "Colorado"])
df2 = DataFrame(np.arange(12).reshape((4, 3)), columns=list("bcd"), index=["Utah", "Ohin", "Texa", "Colorado"])

print df1 + df2
print df1.add(df2, fill_value=0)

series2 = df2.ix[0]

print df2 - series2

ff = lambda x: x.max() - x.min()

print df2.apply(ff)
print df2.apply(ff, axis=1)

df3 = DataFrame(np.random.randn(3, 3), columns=list("bcd"), index=["Ohin", "Texa", "Colorado"])
ff2 = lambda x: "%.2f" % x
print df3
print df3.applymap(ff2)
print df3
Example #41
class MultiFactor:
    def __init__(self, factor_name, stocks, start_date=None, end_date=None):
        self.factor_name = factor_name
        self.start_date = start_date
        self.end_date = end_date
        self.stocks = stocks
        self.factor = None
        self.factor_list = None
        self.method = None
        self.quantile_nl = None

    def set_factor(self):
        self.factor_list = None
        self.method = None
        self.quantile_nl = None

    def get_factor(self):
        self.factor_dict = {
            factor:
            pd.read_csv('%s/Data/%s.csv' % (gc.FACTORBASE_PATH, factor),
                        index_col=[0])
            for factor in self.factor_list
        }
        self.df = DataFrame({
            factor: self.factor_dict[factor].values.reshape(-1)
            for factor in self.factor_list
        })
        self.corr = self.df.corr()
        self.e_value, self.e_vector = np.linalg.eig(self.corr)
        r = np.array(len(self.e_value) - rankdata(self.e_value),
                     dtype=np.int32)
        self.e_value = self.e_value[r]
        self.e_vector = self.e_vector[:, r]

    def pairplot(self):
        plt.figure(figsize=(16, 12))
        sns.pairplot(self.df)
        plt.savefig('%s/Results/%s/pair.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name))

    def corrplot(self):
        plt.figure(figsize=(16, 12))
        sns.heatmap(self.corr)
        plt.savefig('%s/Results/%s/corr.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name))

    def screeplot(self):
        plt.figure(figsize=(16, 12))
        plt.plot(self.e_value / self.e_value.sum())
        plt.savefig('%s/Results/%s/scree.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name))

    def multi_analysis(self):
        if not os.path.exists('%s/Results/%s' %
                              (gc.MULTIFACTOR_PATH, self.factor_name)):
            os.mkdir('%s/Results/%s' % (gc.MULTIFACTOR_PATH, self.factor_name))
        self.pairplot()
        self.corrplot()
        self.screeplot()

    def combine_factor(self):
        self.factor = DataFrame()
        if self.method == 'ew':
            for factor in self.factor_list:
                self.factor = self.factor.add(self.factor_dict[factor],
                                              fill_value=0)
        elif self.method[:4] == 'pca_':
            pca_num = int(self.method[4])
            for i in range(len(self.factor_list)):
                self.factor = self.factor.add(
                    self.e_vector[i, pca_num] *
                    self.factor_dict[self.factor_list[i]],
                    fill_value=0)
        if self.quantile_nl:
            self.factor = self.factor.subtract(self.factor.quantile(
                self.quantile_nl, axis=1),
                                               axis=0)**2

    def inf_to_nan(self, factor):
        factor[factor == np.inf] = np.nan
        factor[factor == -np.inf] = np.nan
        return factor

    def factor_analysis(self,
                        industry_neutral=True,
                        size_neutral=True,
                        num_group=10):
        self.factor = self.inf_to_nan(self.factor)
        stocks = self.stocks
        start_date = self.start_date
        end_date = self.end_date
        y1 = pd.read_csv('%s/Data/y1.csv' % gc.LABELBASE_PATH,
                         index_col=[0],
                         parse_dates=[0]).loc[:, stocks]
        y2 = pd.read_csv('%s/Data/y2.csv' % gc.LABELBASE_PATH,
                         index_col=[0],
                         parse_dates=[0]).loc[:, stocks]
        y3 = pd.read_csv('%s/Data/y3.csv' % gc.LABELBASE_PATH,
                         index_col=[0],
                         parse_dates=[0]).loc[:, stocks]
        y4 = pd.read_csv('%s/Data/y4.csv' % gc.LABELBASE_PATH,
                         index_col=[0],
                         parse_dates=[0]).loc[:, stocks]
        y5 = pd.read_csv('%s/Data/y5.csv' % gc.LABELBASE_PATH,
                         index_col=[0],
                         parse_dates=[0]).loc[:, stocks]

        if start_date:
            y1 = y1.loc[y1.index >= start_date, :]
            y2 = y2.loc[y2.index >= start_date, :]
            y3 = y3.loc[y3.index >= start_date, :]
            y4 = y4.loc[y4.index >= start_date, :]
            y5 = y5.loc[y5.index >= start_date, :]

        if end_date:
            y1 = y1.loc[y1.index <= end_date, :]
            y2 = y2.loc[y2.index <= end_date, :]
            y3 = y3.loc[y3.index <= end_date, :]
            y4 = y4.loc[y4.index <= end_date, :]
            y5 = y5.loc[y5.index <= end_date, :]

        self.y1 = y1
        self.y2 = y2
        self.y3 = y3
        self.y4 = y4
        self.y5 = y5

        if not os.path.exists(
                '%s/Results/%s/%s' %
            (gc.MULTIFACTOR_PATH, self.factor_name, self.method)):
            os.mkdir('%s/Results/%s/%s' %
                     (gc.MULTIFACTOR_PATH, self.factor_name, self.method))
        factor = self.factor.copy()

        # Industry neutralization
        if industry_neutral:
            industrys = tools.get_industrys('L1', self.stocks)
            tmp = {}
            for k in industrys.keys():
                if len(industrys[k]) > 0:
                    tmp[k] = industrys[k]
            industrys = tmp
            factor = tools.standardize_industry(self.factor, industrys)
            self.factor_industry_neutral = factor.copy()

        # Market-cap neutralization
        if size_neutral:
            market_capitalization = DataFrame({
                stock:
                pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv' %
                            (gc.DATABASE_PATH, stock),
                            index_col=[0],
                            parse_dates=[0]).loc[:, 'TOTMKTCAP']
                for stock in self.stocks
            })
            market_capitalization = np.log(market_capitalization)
            if self.start_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index >= self.start_date, :]
            if self.end_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index <= self.end_date, :]
            if industry_neutral:
                market_capitalization = tools.standardize_industry(
                    market_capitalization, industrys)
            beta = (factor * market_capitalization).sum(1) / (
                market_capitalization * market_capitalization).sum(1)
            factor = factor - market_capitalization.mul(beta, axis=0)
            self.factor_industry_size_neutral = factor.copy()

        # self.factor_industry_neutral.fillna(0, inplace=True)
        # self.factor_industry_size_neutral.fillna(0, inplace=True)
        # factor.fillna(0, inplace=True)
        # Factor distribution
        plt.figure(figsize=(16, 12))
        plt.hist(factor.fillna(0).values.flatten())
        plt.savefig('%s/Results/%s/%s/hist.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))

        # IC, IR, grouped backtests
        ys = [self.y1, self.y2, self.y3, self.y4, self.y5]
        IC = {}
        IR = {}
        group_backtest = {}
        group_pos = {}

        for i in range(len(ys)):
            if industry_neutral:
                y_neutral = tools.standardize_industry(ys[i], industrys)
            if size_neutral:
                y_neutral = y_neutral - market_capitalization.mul(
                    (y_neutral * market_capitalization).sum(1) /
                    (market_capitalization * market_capitalization).sum(1),
                    axis=0)
            IC[i] = (y_neutral *
                     factor).mean(1) / factor.std(1) / y_neutral.std(1)
            IR[i] = IC[i].rolling(20).mean() / IC[i].rolling(20).std()
            factor_quantile = DataFrame(
                rankdata(factor, axis=1),
                index=factor.index,
                columns=factor.columns).div(factor.notna().sum(1),
                                            axis=0)  # / len(factor.columns)
            factor_quantile[factor.isna()] = np.nan
            group_backtest[i] = {}
            group_pos[i] = {}
            for n in range(num_group):
                group_pos[i][n] = DataFrame((n / num_group <= factor_quantile)
                                            & (factor_quantile <=
                                               (n + 1) / num_group))
                group_pos[i][n][~group_pos[i][n]] = np.nan
                group_pos[i][n] = 1 * group_pos[i][n]
                group_backtest[i][n] = ((group_pos[i][n] * ys[i]).mean(1) -
                                        ys[i].mean(1)).cumsum().rename(
                                            '%s' % (n / num_group))
        self.IC = IC
        self.IR = IR
        self.group_pos = group_pos
        self.group_backtest = group_backtest

        plt.figure(figsize=(16, 12))
        for i in range(len(ys)):
            IC[i].cumsum().plot()
        plt.legend(['%s' % i for i in range(len(ys))])
        plt.savefig('%s/Results/%s/%s/IC.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))

        plt.figure(figsize=(16, 12))
        for i in range(len(ys)):
            IR[i].cumsum().plot()
        plt.legend(['%s' % i for i in range(len(ys))])
        plt.savefig('%s/Results/%s/%s/IR.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))

        for i in range(len(ys)):
            plt.figure(figsize=(16, 12))
            for n in range(num_group):
                group_backtest[i][n].plot()
            plt.legend(['%s' % i for i in range(num_group)])
            plt.savefig(
                '%s/Results/%s/%s/groupbacktest%s.png' %
                (gc.MULTIFACTOR_PATH, self.factor_name, self.method, i))

    def update_factor(self):
        self.set_factor()
        self.get_factor()
        self.combine_factor()
        self.factor = self.inf_to_nan(self.factor)
        #if 'industry' in self.neutral_list:
        if True:
            industrys = tools.get_industrys('L1', self.stocks)
            tmp = {}
            for k in industrys.keys():
                if len(industrys[k]) > 0:
                    tmp[k] = industrys[k]
            industrys = tmp
            factor = tools.standardize_industry(self.factor, industrys)
        #if 'market_capitalization' in self.neutral_list:
        if True:
            market_capitalization = DataFrame({
                stock:
                pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv' %
                            (gc.DATABASE_PATH, stock),
                            index_col=[0],
                            parse_dates=[0]).loc[:, 'TOTMKTCAP']
                for stock in self.stocks
            })
            market_capitalization = np.log(market_capitalization)
            if self.start_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index >= self.start_date, :]
            if self.end_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index <= self.end_date, :]
            #if 'industry' in self.neutral_list:
            if True:
                market_capitalization = tools.standardize_industry(
                    market_capitalization, industrys)
            beta = (factor * market_capitalization).sum(1) / (
                market_capitalization * market_capitalization).sum(1)
            factor = factor - market_capitalization.mul(beta, axis=0)
        # factor.fillna(0, inplace=True)
        if os.path.exists('%s/Data/%s.csv' %
                          (gc.FACTORBASE_PATH, self.factor_name)):
            if isinstance(factor.index[0], str):
                factor_old = pd.read_csv(
                    '%s/Data/%s.csv' % (gc.FACTORBASE_PATH, self.factor_name),
                    index_col=[0])
            else:
                factor_old = pd.read_csv(
                    '%s/Data/%s.csv' % (gc.FACTORBASE_PATH, self.factor_name),
                    index_col=[0],
                    parse_dates=[0])
            factor = pd.concat([
                factor_old, factor.loc[factor.index > factor_old.index[-1], :]
            ],
                               axis=0)
            factor.sort_index(axis=0, inplace=True)
        factor.sort_index(axis=1, inplace=True)
        factor.to_csv('%s/Data/%s.csv' %
                      (gc.FACTORBASE_PATH, self.factor_name))
Example #42
Test_Dataset = []
for i in test_person:
    X = Dataset_originall[Dataset_originall['subject#'] ==
                          i]  # extract all rows for subject i as a DataFrame
    X1 = [X.iloc[:, 4].values, X.iloc[:, 5].values,
          X.iloc[:, 6:].values]  # split into labels y and features X
    Test_Dataset.append(X1)

#
# Next, pool everyone's data together without distinguishing individuals
#
training_person_set = set(training_person)

TRAININGDATA = DataFrame()
for i in training_person:
    TRAININGDATA = TRAININGDATA.add(
        Dataset_originall[Dataset_originall['subject#'] == i], fill_value=0)
X_training = TRAININGDATA.iloc[:, 6:].values
y1_training = TRAININGDATA.iloc[:, 4].values
y2_training = TRAININGDATA.iloc[:, 5].values

TESTDATA = DataFrame()
for i in test_person:
    TESTDATA = TESTDATA.add(
        Dataset_originall[Dataset_originall['subject#'] == i], fill_value=0)
X_test = TESTDATA.iloc[:, 6:].values
y1_test = TESTDATA.iloc[:, 4].values
y2_test = TESTDATA.iloc[:, 5].values


# Feature standardization: ideally done per split, but here the statistics are computed over the whole dataset
def Scaler(X, mean, variance):
Example #43
def updateHeatmap(severity, weekdays, time, age):
    # The rangeslider selects inclusively, but a Python range stops before its last number
    hours = [i for i in range(time[0], time[1] + 1)]
    # Take a copy of the dataframe, filtering it and grouping
    acc2 = DataFrame(acc[['発生曜日', '発生時', '死亡', '重傷',
                          '軽傷']][(acc['甲_年齢'].isin(age))])

    acc2_hmap = DataFrame(index=weekdays, columns=hours)
    for sev in severity:
        acc2_pivot = pivot_table(data=acc2,
                                 values=sev,
                                 index='発生曜日',
                                 columns='発生時',
                                 aggfunc='sum')
        try:
            acc2_sum = DataFrame(acc2_pivot.loc[weekdays, hours],
                                 index=weekdays,
                                 columns=hours)
        except:
            acc2_sum = DataFrame(index=weekdays, columns=hours)
        acc2_hmap = acc2_hmap.add(acc2_sum, fill_value=0).fillna(0)

    # Apply text after grouping
    def heatmapText(day, time, sr_hmap):
        FORMAT = '{}曜日  {:02d}時台<br>死傷者数: {:.0f}人'
        t_list = []
        for row in zip(range(time[0], time[1] + 1), sr_hmap):
            t_list.append(FORMAT.format(day, row[0], row[1]))
        return t_list

    # Pre-sort a list of days to feed into the heatmap
    days = sorted(weekdays, key=lambda k: DAYSORT[k])

    # Create the z-values and text in a nested list format to match the shape of the heatmap
    z = []
    text = []
    for d in days:
        row = acc2_hmap.loc[d]
        t = heatmapText(d, time, acc2_hmap.loc[d])
        z.append(row)
        text.append(t)

    # Plotly's standard 'Electric' colourscale is great, but its maximum value is white, which is
    #  also the colour for missing values. I set the maximum to the penultimate value and
    #  spread out the others. Plotly colourscales: https://github.com/plotly/plotly.py/blob/master/plotly/colors.py

    Electric = [[0, 'rgb(0,0,0)'], [0.25, 'rgb(30,0,100)'],
                [0.55, 'rgb(120,0,100)'], [0.8, 'rgb(160,90,0)'],
                [1, 'rgb(230,200,0)']]

    # Heatmap trace
    traces = [{
        'type': 'heatmap',
        'x': hours,
        'y': days,
        'z': z,
        'text': text,
        'hoverinfo': 'text',
        'colorscale': Electric,
    }]

    fig = {
        'data': traces,
        'layout': {
            'paper_bgcolor': 'white',
            'font': {
                'color': 'black'
            },
            'height': 300,
            'title': '曜日・時間帯別の事故状況',
            'margin': {
                'b': 25,
                'l': 30,
                't': 70,
                'r': 0,
            },
            'xaxis': {
                'ticktext':
                hours,  # for the tickvals and ticktext with one for each hour
                'tickvals': hours,
                'tickmode': 'array',
            }
        }
    }
    return fig
Example #44
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
ser_a=Series([100,200,300],index=['a','b','c'])
ser_b=Series([300 ,400,500,600], index=['a','b','c','d'])

#sum of series
print(ser_a+ser_b)

#dataframe
df1=DataFrame(np.arange(4).reshape(2,2),columns=['a','b'],index=['cars','bike'])
print(df1)
df2=DataFrame(np.arange(9).reshape(3,3),columns=['a','b','c'],index=['cars','bike','cycle'])
print(df2)
print(df1+df2)

df1=df1.add(df2,fill_value=0)
print(df1)

ser_c=df2.iloc[0]
print(ser_c)
print(df2-ser_c)
Example #45
 def _cumulative_returns(returns: pd.DataFrame, is_log: bool):
     return returns.cumsum() if is_log else returns.add(1).cumprod().sub(1)
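# Usage sketch (treating the method as a plain function, with made-up returns):
import pandas as pd

rets = pd.DataFrame({'fund': [0.01, -0.02, 0.03]})
print(_cumulative_returns(rets, is_log=False))  # compounded: (1 + r).cumprod() - 1
print(_cumulative_returns(rets, is_log=True))   # log returns simply accumulate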
Example #46
dframe2 = DataFrame(np.arange(9).reshape((3, 3)),
                    columns=list('ABC'),
                    index=['skyrim', 'asgard', 'bumi'])
dframe2

# In[8]:

dframe1 + dframe2

# In[9]:

dframe1

# In[12]:

dframe1.add(dframe2, fill_value=0)

# In[13]:

dframe1

# In[15]:

ser3 = dframe2.ix[0]
ser3

# In[16]:

dframe2 - ser3

# In[ ]:
Example #47
# print b
# print c
# print d
# print e
# print g
# print h
# print i

# Get the number of rows
len(f.index)



# Arithmetic (DataFrame works the same way)
s1 = Series(np.arange(10,20),index=np.arange(0,10))
s2 = Series(np.arange(50,60),index=np.arange(5,15))
s3 = s1 + s2
# Fill in values missing from one side
s4 = s1.add(s2,fill_value = 0)
# print s3
# print s4

# Arithmetic between Series and DataFrame
s1 = f.ix[0,:]
# Each row of the DataFrame subtracts the matching Series values by index
f1 = f - s1
# print f1
# To operate column-wise, specify the axis
s1 = f.ix[:,0]
f1 = f.add(s1,axis = 0)
print f1