def test_combine_series(self):
    """Panel + Series arithmetic matches DataFrame.add along both axes."""
    # Column slice: align on the index (axis=0).
    col_slice = self.panel['ItemA'][:10]
    observed = self.panel.add(col_slice, axis=0)
    assert_frame_equal(observed, DataFrame.add(self.panel, col_slice, axis=0))

    # Row slice: the `+` operator aligns on the columns (axis=1).
    row_slice = self.panel.ix[5]
    observed = self.panel + row_slice
    assert_frame_equal(observed, DataFrame.add(self.panel, row_slice, axis=1))
def test_fill_value_when_combine_const(self): # GH12723 dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float') df = DataFrame({'foo': dat}, index=range(6)) exp = df.fillna(0).add(2) res = df.add(2, fill_value=0) assert_frame_equal(res, exp)
def main():
    """Run a top-20 momentum-style backtest from precomputed factor files.

    Reads predicted ICs and factor exposures, scores each stock per day,
    simulates a 20-stock book with 20% daily turnover, and writes positions,
    P&L, and a cumulative-return plot to the backtest results directory.
    """
    # Backtest window; both filters below are strict (> begin, < end).
    begin_date = '20200401'
    end_date = '20210125'
    factors = ['Beta', 'ChipsCV', 'Close', 'CloseToAverage', 'HK', 'Jump',
               'MC', 'MCNL', 'MomentumInd', 'RQPM', 'Sigma', 'Skew',
               'TurnRate', 'Reversal', 'Value']
    # Fetch the predicted excess-return signal (predicted IC per factor).
    IC_hat = pd.read_csv('%s/Results/IC_hat.csv'%gc.IC_PATH, index_col=[0], parse_dates=[0])
    IC_hat = IC_hat.loc[IC_hat.index>begin_date, :]
    IC_hat = IC_hat.loc[IC_hat.index<end_date, :]
    # Realised next-period returns (labels) over the same window.
    y = pd.read_csv('%s/Data/y1.csv'%gc.LABELBASE_PATH, index_col=[0], parse_dates=[0])
    y = y.loc[y.index>begin_date, :]
    y = y.loc[y.index<end_date, :]
    # Per-stock score: sum over factors of exposure * predicted IC.
    y_hat = DataFrame(0, index=y.index, columns=y.columns)
    for factor in factors:
        factor_df = pd.read_csv('%s/Data/%s.csv'%(gc.FACTORBASE_PATH, factor), index_col=[0], parse_dates=[0])
        factor_df = factor_df.loc[factor_df.index>begin_date, :]
        factor_df = factor_df.loc[factor_df.index<end_date, :]
        # fill_value=0 so a stock missing from one factor file keeps its
        # accumulated score from the other factors.
        y_hat = y_hat.add(factor_df.mul(IC_hat.loc[:, factor], axis=0), fill_value=0)
    # Book parameters: hold `stock_num` names, replace at most `trade_num` daily.
    stock_num = 20
    turn_rate = 0.2
    trade_num = int(stock_num * turn_rate)
    df_position = DataFrame(index=y.index, columns=list(range(stock_num)))
    # Seed day one with the top-scored stocks.
    df_position.iloc[0, :] = list(y_hat.iloc[0, :].sort_values(ascending=False).iloc[:stock_num].index)
    df_pnl = DataFrame(0, index=y.index, columns=list(range(stock_num)))
    pre_date = df_position.index[0]
    for date in df_position.index[1:]:
        pre_position = list(df_position.loc[pre_date, :])
        # Keep the best (stock_num - trade_num) of yesterday's holdings...
        position = list(y_hat.loc[date, pre_position].sort_values(ascending=False).dropna().iloc[:(stock_num-trade_num)].index)
        # ...then top up from today's best-scored candidates that have a
        # non-NaN label (i.e. are tradable) until the book is full again.
        stocks = y_hat.loc[date, :].sort_values(ascending=False).index
        for stock in stocks:
            if stock not in position:
                if pd.notna(y.loc[date, stock]):
                    position.append(stock)
            if len(position) >= stock_num:
                break
        position.sort()
        df_position.loc[date, :] = position
        df_pnl.loc[date, :] = y.loc[date, position].values
        pre_date = date
    # Equal-weight daily P&L across the book.
    pnl = df_pnl.mean(1)
    df_position.to_csv('%s/Results/df_position.csv'%gc.BACKTEST_PATH)
    df_pnl.to_csv('%s/Results/df_pnl.csv'%gc.BACKTEST_PATH)
    pnl.to_csv('%s/Results/pnl.csv'%gc.BACKTEST_PATH)
    # Cumulative strategy return vs. the equal-weight benchmark and the alpha.
    plt.figure(figsize=(16,12))
    pnl.cumsum().plot()
    y.mean(1).cumsum().plot()
    (pnl - y.mean(1)).cumsum().plot()
    plt.legend(['PNL', 'BENCHMARK', 'ALPHA'])
    plt.savefig('%s/Results/backtest.png'%gc.BACKTEST_PATH)
def get_no_rebalancing_port_daily_value_df(weight_series: pd.Series, daily_return_df: pd.DataFrame) -> pd.DataFrame:
    """Daily asset values of a buy-and-hold (no-rebalancing) portfolio.

    The initial total value is 1, split across assets by ``weight_series``.

    :param weight_series: initial weight per asset; must sum to 1.
    :param daily_return_df: daily simple returns, one column per asset.
    :return: DataFrame of each asset's value path over time.
    :raises AssertionError: if the weights do not sum to 1.
    """
    # FIX: compare with a tolerance — float weights (e.g. three 1/3
    # positions) rarely sum to exactly 1.0, which made the old exact
    # equality check fail spuriously.
    assert abs(weight_series.sum() - 1.0) < 1e-9
    # Compound the returns, then scale each column by its initial weight.
    growth = daily_return_df.add(1).cumprod()
    return growth.multiply(weight_series)
def sens_to_zero_rates(contract, market, curve_ccy, rate_key, reporting_ccy):
    """Sensitivity of each cashflow to the curve specified by currency and key

    A leg that pays IBOR is sensitive to both the discount and tenor curve
    of the currency in which the cash flows (coupons) are paid.
    """
    # Default: an empty frame is returned when the curve's currency does
    # not match the leg's currency.
    df_sens = DataFrame(columns=['ttm', 'sens', 'ccy', 'curve'])
    if curve_ccy == contract.currency:
        forwards = ibor_rate(contract, market)
        # replace rate with forwards for any fixing date after valuation date
        a = contract.frame
        a.rate = a.rate.where(a.fixing < market.dt_valuation, forwards)
        # Discount factors at each pay date, in the leg's own currency.
        zcb_pay = market.discount_factor(a.pay, currency=contract.currency)
        if rate_key == 'discount':
            # Discount-curve sensitivity: only cashflows not yet paid matter.
            unpaid = a.pay >= market.dt_valuation
            crv = market.discount_curve(curve_ccy)
            pay_dates = a.pay[unpaid]
            ttm_pay = crv.daycount_fn(market.dt_valuation, pay_dates)
            # -t * discounted coupon: the form of dP/dz for P = exp(-z*t).
            sens = -ttm_pay * (zcb_pay * a.notional * a.rate * a.period).loc[unpaid]
            if contract.notl_exchange and unpaid.any():
                # The final notional exchange adds its own sensitivity to
                # the last unpaid cashflow.
                sens.iloc[-1] += a.notional.iloc[-1]
            if reporting_ccy != contract.currency:
                sens *= market.fx(reporting_ccy, contract.currency)
            df_sens = DataFrame({'ttm': ttm_pay, 'sens': sens, 'ccy': curve_ccy, 'curve': 'discount'})
        elif rate_key == contract.frequency:
            # TODO - Review and add comments
            # Projection (tenor) curve sensitivity for the leg's frequency:
            # only fixings still in the future contribute.
            crv, crv_key = market.curve(contract.currency, contract.frequency)
            unfixed = a.fixing >= market.dt_valuation
            pay_dates = a.pay.loc[unfixed]
            ttm_pay = crv.daycount_fn(market.dt_valuation, pay_dates)
            zcbi_pay = crv.discount_factor(pay_dates)
            fix_dates = a.fixing.loc[unfixed]
            ttm_fix = crv.daycount_fn(market.dt_valuation, fix_dates)
            # NOTE(review): computed over ALL fixings, then divided by the
            # unfixed-only zcbi_pay — presumably index alignment drops the
            # already-fixed rows; confirm this is intentional.
            zcbi_fix = crv.discount_factor(contract.frame.fixing)
            scale_factor = zcbi_fix / zcbi_pay * (a.notional * zcb_pay).loc[unfixed]
            sens_pay = ttm_pay * scale_factor
            sens_fix = -ttm_fix * scale_factor
            if reporting_ccy != contract.currency:
                fx = market.fx(reporting_ccy, contract.currency)
                sens_pay *= fx
                sens_fix *= fx
            # Combine pay-date and fix-date sensitivities, summing any that
            # land on the same time-to-maturity.
            df_pay = DataFrame({'ttm': ttm_pay, 'sens': sens_pay}).set_index('ttm')
            df_fix = DataFrame({'ttm': ttm_fix, 'sens': sens_fix}).set_index('ttm')
            df_sens = df_pay.add(df_fix, fill_value=0)
            df_sens['ttm'] = df_sens.index
            df_sens['ccy'] = curve_ccy
            df_sens['curve'] = crv_key
    return df_sens
def combine(self, datas):
    """Accumulate per-column outlier indicators across every frame in *datas*.

    Returns a single-column DataFrame indexed like the first frame; its
    column name is the comma-joined list of contributing column names.
    """
    totals = DataFrame(np.zeros(len(datas[0])), index=datas[0].index)
    tolerance = 1.0e-8
    used_columns = []
    for frame in datas:
        for name in frame.columns:
            series = frame[name]
            mean_val = series.mean()
            std_val = series.std()
            # A (near-)constant column carries no outlier signal; skip it.
            if std_val < tolerance:
                continue
            flags = series.map(lambda x: self.is_outlier(x, mean_val, std_val))
            used_columns.append(name)
            totals = totals.add(flags, axis=0)
    totals.columns = [','.join(used_columns)]
    return totals
def add_noise(df: DataFrame, a=-1, b=1, method='add') -> DataFrame:
    """Add noise to all rows in a DataFrame.

    :param df: The DataFrame we want to add noise to (not modified in place)
    :param a: lower limit of random number that we'll use
    :param b: upper limit of random number that we'll use
    :param method: 'add' or 'mul'. The method used to add noise: add or
        multiply every value with a random value
    :return: A new DataFrame with random numbers between 'a' and 'b' applied
        to every cell
    :raises AssertionError: if *method* is neither 'add' nor 'mul'
    """
    import numpy as np
    # One uniform draw per cell, scaled from [0, 1) into [a, b).
    # FIX: removed a leftover debug print of the noise matrix.
    rand_vector = (b - a) * np.random.random_sample(
        (len(df), len(df.columns))) + a
    if method == 'add':
        return df.add(rand_vector, axis=0)
    if method == 'mul':
        return df.mul(rand_vector, axis=0)
    raise AssertionError("Method must be either 'add' or 'mul'")
def sum_boxscores(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Adds two dataframes. It won't sum the `'number'` column.

    :param df1: First dataframe to add.
    :param df2: Second dataframe to add.
    :return: The sum of the dataframes.
    """
    # Jersey numbers are identifiers, not quantities: keep df2's value and
    # fall back to df1's wherever df2 has none.
    jerseys = df1.loc[:, "number"].combine(
        df2.loc[:, "number"],
        lambda first, second: first if pd.isna(second) else second)
    # Minutes are timedeltas, so they need a timedelta fill value.
    total_minutes = df1.loc[:, "minutes"].add(
        df2.loc[:, "minutes"], fill_value=pd.to_timedelta(0.0))
    # Everything else is numeric and sums element-wise.
    special = ["number", "minutes"]
    combined = df1.drop(columns=special).add(df2.drop(columns=special),
                                             fill_value=0)
    combined.loc[:, "number"] = jerseys
    combined.loc[:, "minutes"] = total_minutes
    return combined
import numpy as np
from pandas import DataFrame, Series

# A Series over a descending integer range.
data = np.arange(100, 80, -1)
s = Series(data)

# A frame with three columns on the same descending 100..81 index...
dic = {
    'line01': np.arange(20, 40),
    'line03': np.linspace(30, 35, 20),
    'line02': np.arange(80, 60, -1),
}
f = DataFrame(dic, index=np.arange(100, 80, -1))
# ...and a smaller frame overlapping only on 'line03' and ten index rows.
f2 = DataFrame({'line03': np.linspace(30, 35, 10), 'line04': np.arange(10)},
               index=np.arange(100, 90, -1))
f3 = f.add(f2)  # aligned addition: any unmatched cell becomes NaN

# Column totals (name kept from the original script; shadows builtin sum).
sum = f.sum()
# Totals of selected columns only.
sum1 = f[['line01', 'line02']].sum()
# Row totals.
sum2 = f.sum(axis=1)
# skipna defaults to True (NaN ignored); with skipna=False any row or
# column containing a NaN sums to NaN.
sum4 = f3.sum(skipna=False)
# Get all values
# DataFrame basic arithmetic
# (the opening of this tutorial's docstring lies before this chunk)
from pandas import DataFrame
import numpy as np

# Two 3x3 frames over the same 'a'..'c' columns; frame2 is frame1 plus one.
frame1 = DataFrame(np.arange(0, 9).reshape(3, 3), columns=list('abc'))
frame2 = DataFrame(np.arange(1, 10).reshape(3, 3), columns=list('abc'))
print(frame1)
print(frame2)

# frame addition
add = frame1.add(frame2)
print(add)

# frame subtraction
sub = frame2.sub(frame1)
print(sub)

# frame division — operator and method forms are equivalent
div = frame2 / frame1
div = frame2.div(frame1)
print(div)  # inf wherever the denominator is 0

# frame multiplication
mul = frame1.mul(frame2)
print(mul)

# row/column totals, means, max/min follow in the original file
# Build the test set: one entry per held-out subject.
test_person=list(test_person)
test_person.sort()
Test_Dataset=[]
for i in test_person:
    # Pull subject i's rows out of the full dataset as a DataFrame.
    X=Dataset_originall[Dataset_originall['subject#']==i]
    # Split into the two label columns (4, 5) and the feature block (6:).
    X1=[X.iloc[:,4].values,X.iloc[:,5].values,X.iloc[:,6:].values]
    Test_Dataset.append(X1)
# Next, pool each person's data together without distinguishing subjects.
# training_person_set=set(training_person)
TRAININGDATA=DataFrame()
for i in training_person:
    # NOTE(review): .add(..., fill_value=0) only behaves like concatenation
    # if the per-subject row indices are disjoint — confirm that holds;
    # pd.concat would state the intent directly.
    TRAININGDATA =TRAININGDATA.add(Dataset_originall[Dataset_originall['subject#']==i],fill_value=0)
X_training=TRAININGDATA.iloc[:,6:].values
y1_training=TRAININGDATA.iloc[:,4].values
y2_training=TRAININGDATA.iloc[:,5].values
TESTDATA=DataFrame()
for i in test_person:
    TESTDATA =TESTDATA.add(Dataset_originall[Dataset_originall['subject#']==i],fill_value=0)
X_test=TESTDATA.iloc[:,6:].values
y1_test=TESTDATA.iloc[:,4].values
y2_test=TESTDATA.iloc[:,5].values
# Feature standardisation. (Original note: ideally per-split statistics
# should be used, but here the overall statistics are computed.)
def Scaler(X,mean,variance):
    # NOTE(review): divides by the value named `variance` — if this really
    # is a variance rather than a standard deviation, this is not a z-score.
    return (X-mean)/variance
def main():
    """Walk through pandas reindex / drop / indexing / arithmetic / sort /
    rank basics.

    NOTE(review): legacy tutorial — Python 2 `print` statements and
    long-removed pandas APIs (`.ix`, `.icol`/`.irow`, `Series.order`,
    `sort_index(by=...)`); it will not run on Python 3 / modern pandas.
    """
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj
    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2
    # Change NaN
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # not found forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill
    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "),
                      columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]
    # Delete column
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj
    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj
    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # rows
    print data.xs("two", axis=1)
    # icol/irow i is index
    print data.icol(1)
    print data.irow(1)
    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"),
                    index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                    index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2
    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic method are sub/div/mul(ti)
    # Calculation in a DataFrame and Series
    print "", ""
    # subtract from each row. broadcat
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series
    series2 = Series(range(3), index=list("bef"))
    print frame + series2
    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)
    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)
    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)
    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series
    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()
    frame = DataFrame(np.arange(8).reshape((2, 4)),
                      index=["three", "one"], columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)
    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)
    # order by multi columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])
    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # No Duplicates
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print pd.merge(f1, f2, left_index=True, right_index=True)
    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]
    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"),
                   columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
print(s1+s2) # 위와 같은 결과 # index 기준으로 연산 수행 # 겹치는 index가 없을경우 NaN으로 반환 print("\n=======================================================") print("Dataframe_operation") print("=======================================================") df1 = DataFrame(np.arange(9).reshape(3,3), columns=list("abc")) print(df1) print("\n") df2 = DataFrame(np.arange(16).reshape(4,4), columns=list("abcd")) print(df2) print("\n") print(df1+df2) print("\n") print(df1.add(df2,fill_value=0)) # df는 coulmn과 index를 모두 고려해야함 # add operation을 쓰면 NaN값 0으로 변환(fill_value) # Operation Types : add, sub, div, mul print("\n=======================================================") print("Series_DataFrame_operation") print("=======================================================") df = DataFrame(np.arange(16).reshape(4,4), columns=list("abcd")) s = Series(np.arange(10,14), index=list("abcd")) print(df+s) # column을 기준으로 broadcasting이 발생 print("\n=======================================================") print("map_apply_lambda") print("=======================================================")
class YieldReader (object):
    """Reads, stores, and combines per-star nucleosynthesis yield tables.

    `self.yields` is a DataFrame indexed by (mass, file) with one column per
    isotope; `self.masses` and `self.isotopes` mirror its index/columns.

    NOTE(review): relies on project types `Isotope` and `fromKeplerYield`
    and a units module `u` (presumably astropy.units — confirm), and on
    `DataFrame.append`, which was removed in pandas 2.0.
    """

    def __init__ (self, yields = None, scale = 1.0):
        # `yields` is assumed indexed by (mass, file) tuples — TODO confirm.
        self.masses = [] if yields is None else [ind [0] for ind in yields.index]
        self.isotopes = [] if yields is None else [Isotope (iso) for iso in yields.columns]
        self.isotopes.sort ()
        self.yields = DataFrame () if yields is None else yields
        self.yields = self.yields.fillna (0.0)
        # Uniformly rescale all yields (used by __mul__/__div__).
        self.yields *= scale

    @classmethod
    def from_file (cls, filename, mass, **kwargs):
        """Alternate constructor: a reader holding a single yield file."""
        self = cls ()
        self.add_file (filename, mass, **kwargs)
        return self

    @classmethod
    def from_directory (cls, directory = "yields/wh07/", mass_file = "masses", **kwargs):
        """Alternate constructor: load every file listed in `mass_file`.

        Each non-blank line is "<mass> <filename>"; a line with no filename
        gets an empty placeholder row instead.
        """
        self = cls ()
        mass_file = open (directory + "/" + mass_file, "r")
        for line in mass_file:
            if line == "\n":
                continue
            line = line.rstrip ("\n").split (" ")
            try:
                self.add_file (directory + "/" + line [1], float (line [0]) * u.solMass, **kwargs)
            except IndexError:
                # No filename on this line: record the mass with an empty row.
                # NOTE(review): nesting reconstructed from collapsed source;
                # masses.append placed in the except branch because add_file
                # already appends the mass itself — confirm.
                self.yields = self.yields.append (DataFrame ([{"mass": float (line [0]) * u.solMass, "file": directory + "/"}]).set_index (["mass", "file"]))
                self.masses.append (float (line [0]) * u.solMass)
        return self

    @classmethod
    def combine (cls, yield_readers):
        """Alternate constructor: concatenate several readers into one."""
        self = cls ()
        self.masses = u.Quantity (np.array (np.concatenate ([yr.masses for yr in yield_readers])))
        # Union of all isotopes, kept sorted.
        for yr in yield_readers:
            isotopeArray = yr.isotopes
            for iso in isotopeArray:
                if iso not in self.isotopes:
                    self.isotopes.append (iso)
        self.isotopes.sort ()
        # Stack the yield tables; isotopes absent from a table become 0.
        for yr in yield_readers:
            dataframe = yr.yields
            self.yields = self.yields.append (dataframe)
        self.yields = self.yields.fillna (0.0)
        return self

    def add_file (self, filename, mass, winds = True, explosions = True, keplerYield = True, totalYieldName = "yieldall", windYieldName = "yieldwind", expYieldName = None, isotopeName = "isotope", table = 1):
        """Parse one yield file and append its row to `self.yields`.

        `winds`/`explosions` select which yield components are summed;
        the column names default to the KEPLER table layout.
        """
        self.masses.append (mass)
        # Skip the file's header; fromKeplerYield locates the wanted table.
        if keplerYield:
            i = fromKeplerYield (filename, table)
        else:
            i = 0
        result = np.genfromtxt (filename, skip_header = i, names = True, dtype = None)
        yieldDF = {}
        yieldDF ["mass"] = mass
        yieldDF ["file"] = filename
        for row in result:
            # The "total" line terminates the isotope table (str or bytes,
            # depending on how genfromtxt decoded the file).
            if row [isotopeName] == "total" or row [isotopeName] == b"total":
                break
            isotope = Isotope (row [isotopeName])
            if isotope not in self.isotopes:
                self.isotopes.append (isotope)
            yieldDF [isotope.string] = 0.0
            if winds and explosions and totalYieldName is not None:
                yieldDF [isotope.string] += float (row [totalYieldName])
            else:
                if winds:
                    yieldDF [isotope.string] += float (row [windYieldName])
                if explosions:
                    # Explosive part = total - winds unless an explicit
                    # explosion column was given.
                    if expYieldName is None:
                        yieldDF [isotope.string] += float (row [totalYieldName]) - float (row [windYieldName])
                    else:
                        yieldDF [isotope.string] += row [expYieldName]
        self.yields = self.yields.append (DataFrame ([yieldDF]).set_index (["mass", "file"]))
        self.yields = self.yields.fillna (0.0)
        self.isotopes.sort ()

    def get_yield (self, isotope, massArray = None, tolerance = 0.0001):
        """Yields of one isotope (solar masses), optionally subset by rows.

        NOTE(review): the `tolerance` parameter is never used in this body.
        """
        if isinstance (isotope, Isotope):
            isotope = isotope.string
        # Unknown isotope: all-zero result of the appropriate length.
        if isotope not in self.yields:
            if massArray is None:
                return u.Quantity ([0.0] * len (self.yields), u.solMass)
            return u.Quantity ([0.0] * len (massArray), u.solMass)
        if massArray is None:
            return u.Quantity (self.yields [isotope], u.solMass)
        return u.Quantity (self.yields [isotope].iloc [massArray], u.solMass)

    def get_masses (self):
        return self.masses

    def get_keys (self):
        # The (mass, file) index tuples.
        return [i for i in self.yields.index]

    def __add__ (self, other):
        # Element-wise sum; rows/columns present on only one side keep
        # their value (missing side treated as 0).
        return YieldReader (self.yields.add (other.yields, fill_value = 0.0))

    def __mul__ (self, scalar):
        # Rescaling is handled by the constructor's `scale` argument.
        return YieldReader (self.yields, scalar)

    __rmul__ = __mul__

    def __div__ (self, scalar):
        # Python 2 division protocol; delegates to __mul__.
        return self * (1.0 / scalar)

    def __getitem__ (self, i):
        # Row subset; a single index is widened to a 1-row slice.
        if isinstance (i, slice):
            return YieldReader (self.yields [i])
        return YieldReader (self.yields [i:i+1])
def combine_data(futures_info_a: pd.DataFrame, futures_info_b: pd.DataFrame):
    """combine futures close price according to the front to later"""
    # Nothing to merge in: hand back the front frame untouched.
    has_later = futures_info_b is not None and not futures_info_b.empty
    if not has_later:
        return futures_info_a
    # Cells present on only one side are treated as 0 rather than NaN.
    return futures_info_a.add(futures_info_b, fill_value=0)
import numpy as np
from pandas import DataFrame

# Two 5x3 frames of standard-normal draws sharing the same column labels.
npdata = np.random.randn(5, 3)
columnNames = ['x1', 'x2', 'x3']
data = DataFrame(npdata, columns=columnNames)
print('data =')
print(data)

columnNames = ['x1', 'x2', 'x3']
data2 = DataFrame(np.random.randn(5, 3), columns=columnNames)
print('\ndata2 =')
print(data2)

# Element-wise arithmetic aligns on both the index and the columns.
print('\ndata + data2 = ')
print(data.add(data2))
print('\ndata * data2 = ')
print(data.mul(data2))
from pandas import Series, DataFrame
import numpy as np

# Series arithmetic aligns on index labels; unmatched labels produce NaN.
s1 = Series([5, 6, -1, 2], index=['a', 'c', 'd', 'e'])
s2 = Series([3, 4, -1, 2, 7], index=['a', 'c', 'e', 'f', 'g'])
print(s1 + s2)

# DataFrames align on rows AND columns at the same time.
df1 = DataFrame(np.arange(9).reshape(3, 3), columns=list('bcd'),
                index=['seoul', 'busan', 'kwangju'])
df2 = DataFrame(np.arange(12).reshape(4, 3), columns=list('bde'),
                index=['Incheon', 'seoul', 'busan', 'suwon'])
print(df1)
print(df2)
print(df1 + df2)

# Without a fill, unmatched cells are NaN; fill_value=0 keeps them.
df3 = DataFrame(np.arange(12).reshape(3, 4), columns=list('abcd'))
df4 = DataFrame(np.arange(20).reshape(4, 5), columns=list('abcde'))
print(df3)
print(df4)
print(df3 + df4)
print(df3.add(df4, fill_value=0))
# reindex can likewise fill the missing columns with a constant.
print(df3.reindex(columns=df4.columns, fill_value=0))
# Row/column alignment: only cells whose row AND column both match are
# computed; anything aligned against NaN becomes NaN.
df1 = DataFrame(np.arange(9).reshape(3, 3), columns=list('bcd'),
                index=['seoul', 'busan', 'kwangju'])
df2 = DataFrame(np.arange(12).reshape(4, 3), columns=list('bde'),
                index=['incheon', 'seoul', 'busan', 'suwon'])
print(df1 + df2)

df3 = DataFrame(np.arange(12).reshape(3, 4), columns=list('abcd'))
df4 = DataFrame(np.arange(20).reshape(4, 5), columns=list('abcde'))
print(df3 + df4)
# fill_value=0 replaces the missing side with 0, so df4-only cells come
# through as value + 0 instead of NaN.
print(df3.add(df4, fill_value=0))

# DataFrame-vs-Series arithmetic resembles NumPy broadcasting.
print(df3.reindex(columns=df4.columns, fill_value=0))
arr = np.arange(12, ).reshape(3, 4)
print(arr)
print(arr[0])
# The first row is subtracted from every row:
#  0 1  2  3  -  0 1 2 3
#  4 5  6  7
#  8 9 10 11
print(arr - arr[0])
# Two frames with partially-overlapping rows and columns.
dframe1 = DataFrame(np.arange(4).reshape((2, 2)), columns=list('AB'), index=['NYC', 'LA'])
dframe1
dframe2 = DataFrame(np.arange(9).reshape((3, 3)), columns=list('ADC'), index=['NYC', 'SF', 'LA'])
dframe2
# adding dataframes
dframe1 + dframe2  # only adds where both row and column match, everything else will be null
dframe1
dframe1.add(dframe2, fill_value=0)  # doesn't add if row/column combination doesn't exist in either table
# operations between Series and DataFrame
# FIX: `.ix` was removed in pandas 1.0; positional row access is `.iloc`.
ser3 = dframe2.iloc[0]
ser3
dframe2 - ser3
# NOTE(review): continues a Python 2 pandas tutorial — `df1` is defined
# earlier in the file, and `.ix` has been removed from modern pandas.
df2 = DataFrame(np.arange(12).reshape((4, 3)), columns = list('bde'),
                index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print df1
print df2
# Aligned addition: unmatched row/column pairs come out as NaN.
print df1 + df2
print
print '数据填充'
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns = list('abcde'))
print df1
print
print df2
print
# fill_value=0 treats the missing side as 0 instead of NaN.
print df1.add(df2, fill_value = 0)
print df1.reindex(columns = df2.columns, fill_value = 0)
print
print 'DataFrame与Series之间的操作'
# NumPy broadcasting: the first row is subtracted from every row.
arr = np.arange(12.).reshape((3, 4))
print arr
print arr[0]
print arr - arr[0]
frame = DataFrame(np.arange(12).reshape((4, 3)), columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print frame
print series
print
# Series addition: only matching index labels combine; everything else is NaN.
s1 = Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = Series([4, 5, 6, 7], index=['a', 'b', 'd', 'c'])
print(s1)
print(s2)
print(s1 + s2)
print(s1.add(s2))  # method form: aligned on matching index names
print()

df1 = DataFrame(np.arange(9.).reshape(3, 3), columns=list('kbs'),
                index=['서울', '인천', '수원'])
print(df1)
df2 = DataFrame(np.arange(12.).reshape(4, 3), columns=list('kbs'),
                index=['서울', '인천', '일산', '수원'])
print(df2)
print()
print(df1 + df2)                  # the operator form takes no options
print(df1.add(df2))               # the method form accepts options...
print(df1.add(df2, fill_value=0))  # ...such as fill_value
print()
print(df1 + df2)
print(df1.mul(df2))               # mul: element-wise product
print(df1.mul(df2, fill_value=0))
print()

seri = df1.iloc[0]  # first row of df1 as a Series
print(seri)
print(df1 - seri)   # broadcast: the row is subtracted from every row
# NOTE: `ser1` is defined earlier in the file.
ser2 = Series([3, 4, 5, 6], index=['A', 'B', 'C', 'D'])
ser2
# adding series
ser1 + ser2
dframe1 = DataFrame(np.arange(4).reshape((2, 2)), columns=list('AB'), index=['NYC', 'LA'])
dframe1
dframe2 = DataFrame(np.arange(9).reshape((3, 3)), columns=list('ADC'), index=['NYC', 'SF', 'LA'])
dframe2
# adding dataframes
dframe1 + dframe2  # only adds where both row and column match, everything else will be null
dframe1
dframe1.add(
    dframe2, fill_value=0
)  # doesn't add if row/column combination doesn't exist in either table
# operations between Series and DataFrame
# FIX: `.ix` was removed in pandas 1.0; positional row access is `.iloc`.
ser3 = dframe2.iloc[0]
ser3
dframe2 - ser3
# NOTE(review): `data` is defined earlier in the file, and `.ix` has been
# removed from modern pandas (use .loc/.iloc).
data.ix['Colorado',['two','three']]
data.ix['Colorado',[3,0,1]]
data.ix['Colorado']
data['two']
data.two
## Arithmetic alignment
s1 = Series([7.3,-2.5,3.4,1.5], index=['a','b','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1], index=['a','c','e','f','g'])
s1 + s2
df1 = DataFrame(np.arange(9).reshape((3,3)), columns=list('bcd'),
                index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12).reshape((4,3)),columns=list('bde'),
                index=['Utah','Ohio','Texas','Oregon'])
df1 + df2
# "Alignment" means values with identical labels are combined.
# A fill value can be supplied for the labels that fail to match up:
df1.add(df2, fill_value=0)  # this only fills the entries missing from df2
## Operations between DataFrame and Series
frame = DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'),
                  index=['Utah','Utahs','Texas','Oregon'])
series = frame.ix[0]
# Each column subtracts the matching element — broadcasting along the rows.
# To broadcast along the columns instead, do the following:
frame - series
series3 = frame['d']
# NOTE(review): 'd' is a COLUMN label here, not a row label — this .ix row
# lookup looks erroneous in the original tutorial; confirm the intent.
series3 = frame.ix['d',:]
frame.sub(series3, axis=0)
## Function application and mapping
frame = DataFrame(np.random.randn(4,3), columns=list('bde'),
                  index=['Utah','Ohio','Texas','Oregon'])
f = lambda x :x.max()-x.min()
# NOTE(review): `frame5` is defined earlier in the file; `.ix` has been
# removed from modern pandas.
frame5[:2]
frame5[frame5['three'] > 5]
frame5 < 5  # returns booleans
frame5[frame5 < 5] = 0
frame5
# The frame5<5 mask above picks the True cells, and the assignment sets
# every value below 5 to 0.
frame5.loc['Colorado', ['two', 'three']]
frame5.ix[['Ohio', 'Utah'], [3, 0, 1]]  # .ix selects subsets more flexibly
frame5.ix[frame5.three > 7, :3]
## Arithmetic and data alignment
# Only the overlapping rows/columns of a Series/DataFrame are combined;
# non-overlapping positions become NaN.
frame6_1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
frame6_2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
frame6_1 + frame6_2
frame6_1.add(frame6_2, fill_value=0)  # pass frame6_2 plus a fill_value argument
frame6_1.reindex(columns=frame6_2.columns, fill_value=0)  # reindex can also specify a fill
# Operations between DataFrame and Series
yc10 = np.arange(12.).reshape((3, 4))
yc10
yc10 - yc10[1]
frame7 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('abc'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series_7 = frame7.iloc[0]
frame7 - series_7  # subtracted from every row — this is called broadcasting
series_7_1 = frame7['b']
frame7.sub(series_7_1, axis=0)
# 模型在测试集的表现 print("In the test data, the model MSE is %g" % sess.run(loss,feed_dict={x:test_x,y_:test_y})) # 模型导出 # writer = tf.summary.FileWriter('log',sess.graph) # 模型持久化 # saver.save(sess,"model/{}.ckpt".format(KEYWORD)) # 保存结果 # 切换换回原来的坐标轴 # 需要train_data,test_data,out_data out_data.index = pd.date_range(start=raw_data.index[0],freq=FREQ,periods=len(out_data)) # 合并原始数据转成字典 df_data1 = DataFrame({'curve':out_data}) df_data2 = DataFrame({'data':train_data}) df_data3 = DataFrame({'test':test_data}) df_data = df_data1.add(df_data2,fill_value=0) df_data = df_data.add(df_data3,fill_value=0) df_data.plot(figsize=(20, 8.5)) # j_data = json.loads(df_data.to_json(orient='split',date_unit='s')) # 添加key,freq,option # j_data['key'] = KEYWORD # j_data['freq']=FREQ # j_data['option']='TensorFlow' # print(j_data) # 保存到MongoDB # predict_col = pymongo.MongoClient('localhost')['scrapy']['predict'] # predict_col.update_one({"key":j_data['key'],"freq":j_data['freq']},{"$set":j_data},upsert=True)
def sens_to_zero_rates(contract, market, curve_ccy, rate_key, reporting_ccy):
    """Sensitivity of each cashflow to the curve specified by currency and key

    A leg that pays IBOR is sensitive to both the discount and tenor curve
    of the currency in which the cash flows (coupons) are paid.
    """
    # Default: an empty frame when the curve currency doesn't match the leg.
    df_sens = DataFrame(columns=['ttm', 'sens', 'ccy', 'curve'])
    if curve_ccy == contract.currency:
        forwards = ibor_rate(contract, market)
        # replace rate with forwards for any fixing date after valuation date
        a = contract.frame
        a.rate = a.rate.where(a.fixing < market.dt_valuation, forwards)
        # Discount factors at each pay date, in the leg's own currency.
        zcb_pay = market.discount_factor(a.pay, currency=contract.currency)
        if rate_key == 'discount':
            # Discount-curve sensitivity: only not-yet-paid cashflows matter.
            unpaid = a.pay >= market.dt_valuation
            crv = market.discount_curve(curve_ccy)
            pay_dates = a.pay[unpaid]
            ttm_pay = crv.daycount_fn(market.dt_valuation, pay_dates)
            # -t * discounted coupon: the form of dP/dz for P = exp(-z*t).
            sens = -ttm_pay * (zcb_pay * a.notional * a.rate * a.period).loc[unpaid]
            if contract.notl_exchange and unpaid.any():
                # The final notional exchange adds to the last unpaid flow.
                sens.iloc[-1] += a.notional.iloc[-1]
            if reporting_ccy != contract.currency:
                sens *= market.fx(reporting_ccy, contract.currency)
            df_sens = DataFrame({
                'ttm': ttm_pay,
                'sens': sens,
                'ccy': curve_ccy,
                'curve': 'discount'
            })
        elif rate_key == contract.frequency:
            # TODO - Review and add comments
            # Projection (tenor) curve sensitivity: only unfixed coupons.
            crv, crv_key = market.curve(contract.currency, contract.frequency)
            unfixed = a.fixing >= market.dt_valuation
            pay_dates = a.pay.loc[unfixed]
            ttm_pay = crv.daycount_fn(market.dt_valuation, pay_dates)
            zcbi_pay = crv.discount_factor(pay_dates)
            fix_dates = a.fixing.loc[unfixed]
            ttm_fix = crv.daycount_fn(market.dt_valuation, fix_dates)
            # NOTE(review): computed over ALL fixings, then divided by the
            # unfixed-only zcbi_pay — presumably index alignment drops the
            # fixed rows; confirm this is intentional.
            zcbi_fix = crv.discount_factor(contract.frame.fixing)
            scale_factor = zcbi_fix / zcbi_pay * (a.notional * zcb_pay).loc[unfixed]
            sens_pay = ttm_pay * scale_factor
            sens_fix = -ttm_fix * scale_factor
            if reporting_ccy != contract.currency:
                fx = market.fx(reporting_ccy, contract.currency)
                sens_pay *= fx
                sens_fix *= fx
            # Combine pay- and fix-date sensitivities, summing those that
            # land on the same time-to-maturity.
            df_pay = DataFrame({
                'ttm': ttm_pay,
                'sens': sens_pay
            }).set_index('ttm')
            df_fix = DataFrame({
                'ttm': ttm_fix,
                'sens': sens_fix
            }).set_index('ttm')
            df_sens = df_pay.add(df_fix, fill_value=0)
            df_sens['ttm'] = df_sens.index
            df_sens['ccy'] = curve_ccy
            df_sens['curve'] = crv_key
    return df_sens
data.drop('three',axis=1) #直接 定位到元素 点 用ix data.ix['Colorado',['one','four']] #*************************** #对Series 相加 是对相同索引 的数据相加,没有的值 其和 最终会以NaN 来表示 list('abcd') 等价于 ['a','b','c','d'] df1=DataFrame(np.arange(12).reshape(3,4),columns=list('abcd')) df2=DataFrame(np.arange(20).reshape(4,5),columns=list('abcde')) df1.add(df2,fill_value=0) 对未有值的NaN 以0代替 带入加法 #add #sub #div #mul 加减乘除 #*********************************************** frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon']) frame.abs() #自定义函数 f=lambda x:x.max()-x.min() #用apply 来执行 frame.apply(f) #对列执行 frame.apply(f,axis=1) #对 行执行 #定义函数 def f(x): return Series([x.min(),x.max()],index=['min','nax'])
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import nan
import matplotlib.pyplot as plt

# Two frames of different shapes: df2 has one extra row and column 'e'.
df1 = DataFrame(np.arange(12).reshape((3, 4)), columns=list("abcd"))
df2 = DataFrame(np.arange(20).reshape((4, 5)), columns=list("abcde"))
df1
df2
df3 = df1 + df2  # same as df1.add(df2)
df3.iloc[1, 3] = nan
# Grow df1 by row 3 and column 'e', treating the missing cells as 0.
df1.add(df2, fill_value=0)
# Add normally first, then replace the resulting NaNs with 0.
df1.add(df2).fillna(0)
print(df3)
df3.plot()
plt.show()
'''6.DataFrame的运算 同Series一样:在运算中自动对齐不同索引的数据如果索引不对应,则补NaN''' # 6.1 DataFrame之间的运算 df1 = DataFrame( { 'Python': [119, 120, 110], '数学': [130, 118, 112], '英语': [90, 137, 99] }, index=['张三', '王五', '李四']) df2 = DataFrame(data=np.random.randint(0, 150, size=(4, 4)), index=['张三', '王五', '李四', 'Michael'], columns=['Python', '数学', '物理', '英语']) df1 + 10 # dataframe的每一个元素会加10! df1 + df2 # dataframe相加会自动对准index和column,两者相同则自动相加,对不上的则自动补齐NAN! df1.add(df2, fill_value=0) # 使用这种方法可以使索引对不上者不会直接返回NAN,而是自动填上0使之进行计算 df2.add(df1, fill_value=0) # 结果与上面完全一样 '''下面是Python 操作符与pandas操作函数的对应表: + add() - sub(), subtract() * mul(), multiply() / truediv(), div(), divide() // floordiv() % mod() ** pow() 求平均值 mean() 求方差 std()''' # 6.2 DataFrame与series的运算!!!---好好理解这里是怎么回事,有一定难度。 df1 = DataFrame( {
# Arithmetic and data alignment
s1 = Series([1.1, 2.2, 3.3], index=['a', 'b', 'c'])
s2 = Series([-1.1, -2.2, -3.0, 4.4], index=['a', 'b', 'c', 'd'])
s3 = s1 + s2
print(s3)
d1 = DataFrame(np.arange(9).reshape((3, 3)), index=[1, 2, 3],
               columns=list('abc'))
d2 = DataFrame(np.arange(12).reshape((4, 3)), index=[1, 2, 3, 4],
               columns=list('cde'))
d3 = d1 + d2
print(d3)
d3 = d1.add(d2, fill_value=0)
print(d3)

# DataFrame/Series arithmetic and broadcasting
df1 = DataFrame(np.arange(12).reshape((4, 3)), columns=list('abc'),
                index=[1, 2, 3, 4])
s1 = Series(df1.loc[1])
print(df1)
print(s1)
dele = df1 - s1  # the row is broadcast and subtracted row by row
print(dele)
s2 = Series(np.arange(3), index=['c', 'd', 'e'])
add1 = df1 + s2  # non-matching labels are unioned (NaN where absent)
print(add1)
# -*- coding: utf-8 -*-
"""Alignment rules for arithmetic between DataFrames and Series."""
#%%
from pandas import DataFrame, Series
# FIX: `string.letters` existed only in Python 2 and was removed in
# Python 3; `ascii_letters` is the equivalent, bound to the old name so
# the rest of the script is unchanged.
from string import ascii_letters as letters

s1 = Series(range(3), list(letters[:3]))
d1 = DataFrame(
    {'a': range(0, 3)},
    index=list(letters[:3])
)
d2 = DataFrame(
    {'a': range(0, 10), 'b': range(10, 20), 'c': range(20, 30)},
    index=list(letters[:10])
)
#%% Frames add by aligning and combining same-position elements.
d1 + d2
#%% Frame + Series aligns the Series on the columns and broadcasts down rows.
d2 + s1
#%% Use add() to align on the rows instead.
# axis names the label axis used for alignment, not the broadcast direction.
d2.add(s1, axis=0)  # {0, 1, 'index', 'columns'}
#%%
d2.add(s1, axis=1)
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn

# Read a table previously copied to the system clipboard.
df1 = pd.read_clipboard()
df2 = DataFrame(np.arange(16).reshape(4, 4),
                columns="col1 col2 col3 col4".split(),
                index=list('ABCD'))
print(df1, '\n')
print(df1.columns, '\n')  # show the column labels
#print(df1.head(),'\n')  # first 5 rows
print(df1[['col1', 'col3']])  # select columns
print(df1[df1['col4'] > 50])  # rows where col4 > 50
print(df1 > 50)  # element-wise comparison
# BUG FIX: `.ix` was removed from pandas; use label-based `.loc` instead.
print(df1.loc['A'])  # select a row by label
print(df1.drop('B'))  # drop a row
print(df1.drop('col3', axis=1))  # drop a column
print(df2)
print(df2.add(df1, fill_value=0))  # element-wise sum; missing positions count as 0
print(df1.sum())  # per-column sums
print(df1.sum(axis=1))  # per-row sums
print(df1.max())  # per-column max
print(df1.idxmax())  # index label of each column's maximum
# DataFrame alignment: like Series alignment, but on rows AND columns at once.
df1 = DataFrame(np.arange(9.).reshape((3, 3)),
                columns=['b', 'c', 'd'],
                index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4, 3)),
                columns=['b', 'd', 'e'],
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])
# print df1
# print df2
df1_add_df2 = df1 + df2  # only shared labels get real sums; the rest is NaN
# print df1_add_df2

# Supplying a default for positions that exist in just one operand.
df1_default = DataFrame(np.arange(12.).reshape((3, 4)),
                        columns=['a', 'b', 'c', 'd'])
df2_defaule = DataFrame(np.arange(20.).reshape((4, 5)),
                        columns=['a', 'b', 'c', 'd', 'e'])
df1_add_df2_without = df1_default + df2_defaule  # no default -> NaN
df1_add_df2_default = df1_default.add(df2_defaule, fill_value=0)  # default 0
# print df1_add_df2_without
# print df1_add_df2_default

# Operator method counterparts:
# add  addition
# sub  subtraction
# div  division
# mul  multiplication

# DataFrame vs. Series arithmetic mirrors NumPy broadcasting.
arr = np.arange(12.).reshape((3, 4))
# print arr - arr[0]
frame = DataFrame(np.arange(12.).reshape((4, 3)),
                  columns=['b', 'd', 'e'],
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame4 = DataFrame( { 'a': np.random.randn(4), 'b': ['foo', 'lol'] * 2, 'c': [True, False] * 2 }, index=list('abcd')) frame.ix['f'] = np.random.randn(4) frame['loc'] = ['ST', 'MO'] * 3 frame.sort_index(axis=1) frame.sort_values(by=['loc', 'STL']) frame.rank(axis=0) frame.rank(method='max') um.order() um.rank() frame.add(frame2) frame.corr(um) frame.fillna(1, inplace='True') um = frame['UM'] frame.corr() frame.cov() frame2.ix['f'] = np.random.randn(3) frame.corrwith(frame2) frame.corrwith(um) frame.corrwith(um.to_frame()) frame.ix[:, 'Washu':'UMST'].apply(lambda x: x.mean()) frame.set_index('UM', drop=True, inplace=True) keys = frame.index frame.reset_index(level=keys) df = DataFrame(np.random.randn(6, 5),
ser1 = Series([0, 1, 2], index=['A', 'B', 'C'])
ser1
ser2 = Series([3, 4, 5, 6], index=['A', 'B', 'C', 'D'])
ser2
ser1 + ser2  # aligned on index; 'D' has no match in ser1 -> NaN
dframe1 = DataFrame(np.arange(4).reshape(2, 2), columns=list('AB'),
                    index=['NYC', 'LA'])
dframe1
dframe2 = DataFrame(np.arange(9).reshape(3, 3), columns=list('ADC'),
                    index=['NYC', 'SF', 'LA'])
dframe2
dframe1 + dframe2
dframe1.add(dframe2)
# BUG FIX: `.ix` was removed from pandas; positional row access is `.iloc`.
ser3 = dframe2.iloc[0]
ser3
dframe2 - ser3  # broadcast: subtract the first row from every row
ser1 = Series(range(3), index=['C', 'A', 'B'])
ser1
ser1.sort_index()
ser1.sort_values()
ser2 = Series(randn(10))
ser2
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
arr
# In[ ]: df1 # In[ ]: df2 # In[ ]: df1.add(df2, fill_value=0) #与df2比较,df1中空缺的地方置为0,然后二者相加 # In[ ]: df1 #自身未变 # In[ ]: df2 # In[ ]:
from pandas import Series, DataFrame
import numpy as np
import pandas as pd

# Series addition aligns on index labels; unmatched labels yield NaN.
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print(s1 + s2)

df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=['a', 'b', 'c', 'd', 'e'])
print(df1)
print('')
print(df2)

print('without fill value')
print(df1.add(df2))  # positions absent from df1 stay NaN
print('with fill value')
print(df1.add(df2, fill_value=0))  # treat them as 0 instead

# fill_value also applies when reindexing to a wider column set
print(df1.reindex(columns=df2.columns, fill_value=0))
# NOTE(review): Python 2 syntax (print statements, DataFrame.ix) -- this
# snippet will not run under Python 3.
print obj5["one"]
print obj5[:2]
obj5[obj5 < 5] = 3  # boolean-mask assignment
print obj5
print obj5.ix["Ohio", ["one", "two"]]
# Series addition aligns on the index; labels present in only one operand
# ('d', 'f', 'g') become NaN.
s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
print s1 + s2
df1 = DataFrame(np.arange(9).reshape((3, 3)), columns=list("bcd"),
                index=["Ohin", "Texa", "Colorado"])
df2 = DataFrame(np.arange(12).reshape((4, 3)), columns=list("bcd"),
                index=["Utah", "Ohin", "Texa", "Colorado"])
print df1 + df2
print df1.add(df2, fill_value=0)  # missing positions count as 0
series2 = df2.ix[0]
print df2 - series2  # broadcast: subtract row 0 from every row
ff = lambda x: x.max() - x.min()  # range of each column (or row with axis=1)
print df2.apply(ff)
print df2.apply(ff, axis=1)
df3 = DataFrame(np.random.randn(3, 3), columns=list("bcd"),
                index=["Ohin", "Texa", "Colorado"])
ff2 = lambda x: "%.2f" % x
print df3
print df3.applymap(ff2)  # element-wise formatting to two decimals
print df3
class MultiFactor:
    """Combine several single-factor CSV files into one composite factor,
    analyse it (correlations, PCA, IC/IR, grouped backtests) and persist it.

    NOTE(review): relies on module-level names not shown in this chunk:
    pd, np, DataFrame, plt, sns, os, rankdata, tools and the path config gc.
    """

    def __init__(self, factor_name, stocks, start_date=None, end_date=None):
        # Name under which the combined factor is analysed and saved.
        self.factor_name = factor_name
        self.start_date = start_date
        self.end_date = end_date
        self.stocks = stocks
        # Populated later by set_factor()/get_factor()/combine_factor().
        self.factor = None
        self.factor_list = None
        self.method = None
        self.quantile_nl = None

    def set_factor(self):
        """Hook for choosing the ingredient factors, the combination method
        and the nonlinear quantile parameter (overridden/assigned elsewhere)."""
        self.factor_list = None
        self.method = None
        self.quantile_nl = None

    def get_factor(self):
        """Load each ingredient factor from CSV and precompute the factor
        correlation matrix plus its eigen-decomposition (for PCA weights)."""
        self.factor_dict = {
            factor: pd.read_csv('%s/Data/%s.csv' % (gc.FACTORBASE_PATH, factor),
                                index_col=[0])
            for factor in self.factor_list
        }
        # One flattened column per factor so corr() compares factors directly.
        self.df = DataFrame({
            factor: self.factor_dict[factor].values.reshape(-1)
            for factor in self.factor_list
        })
        self.corr = self.df.corr()
        self.e_value, self.e_vector = np.linalg.eig(self.corr)
        # Reorder eigenpairs by descending eigenvalue.
        r = np.array(len(self.e_value) - rankdata(self.e_value), dtype=np.int32)
        self.e_value = self.e_value[r]
        self.e_vector = self.e_vector[:, r]

    def pairplot(self):
        """Save a pairwise scatter plot of the ingredient factors."""
        plt.figure(figsize=(16, 12))
        sns.pairplot(self.df)
        plt.savefig('%s/Results/%s/pair.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name))

    def corrplot(self):
        """Save a heatmap of the factor correlation matrix."""
        plt.figure(figsize=(16, 12))
        sns.heatmap(self.corr)
        plt.savefig('%s/Results/%s/corr.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name))

    def screeplot(self):
        """Save a scree plot (normalised eigenvalue spectrum)."""
        plt.figure(figsize=(16, 12))
        plt.plot(self.e_value / self.e_value.sum())
        plt.savefig('%s/Results/%s/scree.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name))

    def multi_analysis(self):
        """Run all diagnostic plots, creating the output directory if needed."""
        if not os.path.exists('%s/Results/%s' %
                              (gc.MULTIFACTOR_PATH, self.factor_name)):
            os.mkdir('%s/Results/%s' % (gc.MULTIFACTOR_PATH, self.factor_name))
        self.pairplot()
        self.corrplot()
        self.screeplot()

    def combine_factor(self):
        """Combine the ingredient factors into self.factor.

        method 'ew'      -- equal-weighted sum (NaNs treated as 0 via fill_value)
        method 'pca_<k>' -- weight factor i by eigenvector component [i, k]
        If quantile_nl is set, apply a nonlinear transform afterwards:
        squared distance from the chosen cross-sectional quantile.
        """
        self.factor = DataFrame()
        if self.method == 'ew':
            for factor in self.factor_list:
                self.factor = self.factor.add(self.factor_dict[factor],
                                              fill_value=0)
        elif self.method[:4] == 'pca_':
            # NOTE(review): int(self.method[4]) reads a single digit, so
            # component indices >= 10 would be mis-parsed -- confirm range.
            pca_num = int(self.method[4])
            for i in range(len(self.factor_list)):
                self.factor = self.factor.add(
                    self.e_vector[i, pca_num] *
                    self.factor_dict[self.factor_list[i]],
                    fill_value=0)
        if self.quantile_nl:
            self.factor = self.factor.subtract(self.factor.quantile(
                self.quantile_nl, axis=1), axis=0)**2

    def inf_to_nan(self, factor):
        """Replace +/-inf with NaN (mutates and returns the same frame)."""
        factor[factor == np.inf] = np.nan
        factor[factor == -np.inf] = np.nan
        return factor

    def factor_analysis(self, industry_neutral=True, size_neutral=True,
                        num_group=10):
        """Neutralise the combined factor and evaluate it.

        Computes IC/IR against the y1..y5 forward-return labels and runs a
        num_group-quantile grouped backtest; all results are saved as PNGs.
        NOTE(review): if industry_neutral is False the loop below references
        y_neutral before assignment -- the flags are presumably always used
        with industry_neutral=True; verify.
        """
        self.factor = self.inf_to_nan(self.factor)
        stocks = self.stocks
        start_date = self.start_date
        end_date = self.end_date
        y1 = pd.read_csv('%s/Data/y1.csv' % gc.LABELBASE_PATH,
                         index_col=[0], parse_dates=[0]).loc[:, stocks]
        y2 = pd.read_csv('%s/Data/y2.csv' % gc.LABELBASE_PATH,
                         index_col=[0], parse_dates=[0]).loc[:, stocks]
        y3 = pd.read_csv('%s/Data/y3.csv' % gc.LABELBASE_PATH,
                         index_col=[0], parse_dates=[0]).loc[:, stocks]
        y4 = pd.read_csv('%s/Data/y4.csv' % gc.LABELBASE_PATH,
                         index_col=[0], parse_dates=[0]).loc[:, stocks]
        y5 = pd.read_csv('%s/Data/y5.csv' % gc.LABELBASE_PATH,
                         index_col=[0], parse_dates=[0]).loc[:, stocks]
        # Clip every label frame to the analysis window.
        if start_date:
            y1 = y1.loc[y1.index >= start_date, :]
            y2 = y2.loc[y2.index >= start_date, :]
            y3 = y3.loc[y3.index >= start_date, :]
            y4 = y4.loc[y4.index >= start_date, :]
            y5 = y5.loc[y5.index >= start_date, :]
        if end_date:
            y1 = y1.loc[y1.index <= end_date, :]
            y2 = y2.loc[y2.index <= end_date, :]
            y3 = y3.loc[y3.index <= end_date, :]
            y4 = y4.loc[y4.index <= end_date, :]
            y5 = y5.loc[y5.index <= end_date, :]
        self.y1 = y1
        self.y2 = y2
        self.y3 = y3
        self.y4 = y4
        self.y5 = y5
        if not os.path.exists(
                '%s/Results/%s/%s' %
                (gc.MULTIFACTOR_PATH, self.factor_name, self.method)):
            os.mkdir('%s/Results/%s/%s' %
                     (gc.MULTIFACTOR_PATH, self.factor_name, self.method))
        factor = self.factor.copy()
        # Industry neutralisation
        if industry_neutral:
            industrys = tools.get_industrys('L1', self.stocks)
            tmp = {}
            for k in industrys.keys():
                if len(industrys[k]) > 0:
                    tmp[k] = industrys[k]
            industrys = tmp
            factor = tools.standardize_industry(self.factor, industrys)
            self.factor_industry_neutral = factor.copy()
        # Market-cap (size) neutralisation
        if size_neutral:
            market_capitalization = DataFrame({
                stock: pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv' %
                                   (gc.DATABASE_PATH, stock),
                                   index_col=[0],
                                   parse_dates=[0]).loc[:, 'TOTMKTCAP']
                for stock in self.stocks
            })
            market_capitalization = np.log(market_capitalization)
            if self.start_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index >= self.start_date, :]
            if self.end_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index <= self.end_date, :]
            if industry_neutral:
                market_capitalization = tools.standardize_industry(
                    market_capitalization, industrys)
            # Cross-sectional regression slope of factor on log market cap;
            # subtract the fitted part to orthogonalise.
            beta = (factor * market_capitalization).sum(1) / (
                market_capitalization * market_capitalization).sum(1)
            factor = factor - market_capitalization.mul(beta, axis=0)
            self.factor_industry_size_neutral = factor.copy()
        # self.factor_industry_neutral.fillna(0, inplace=True)
        # self.factor_industry_size_neutral.fillna(0, inplace=True)
        # factor.fillna(0, inplace=True)
        # Factor distribution
        plt.figure(figsize=(16, 12))
        plt.hist(factor.fillna(0).values.flatten())
        plt.savefig('%s/Results/%s/%s/hist.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))
        # IC, IR and grouped backtest per label horizon
        ys = [self.y1, self.y2, self.y3, self.y4, self.y5]
        IC = {}
        IR = {}
        group_backtest = {}
        group_pos = {}
        for i in range(len(ys)):
            # Apply the same neutralisations to the labels.
            if industry_neutral:
                y_neutral = tools.standardize_industry(ys[i], industrys)
            if size_neutral:
                y_neutral = y_neutral - market_capitalization.mul(
                    (y_neutral * market_capitalization).sum(1) /
                    (market_capitalization * market_capitalization).sum(1),
                    axis=0)
            # Daily cross-sectional correlation between factor and label.
            IC[i] = (y_neutral * factor).mean(1) / factor.std(1) / y_neutral.std(1)
            IR[i] = IC[i].rolling(20).mean() / IC[i].rolling(20).std()
            # Cross-sectional rank of the factor, scaled to (0, 1].
            factor_quantile = DataFrame(
                rankdata(factor, axis=1),
                index=factor.index,
                columns=factor.columns).div(factor.notna().sum(1), axis=0)
            # / len(factor.columns)
            factor_quantile[factor.isna()] = np.nan
            group_backtest[i] = {}
            group_pos[i] = {}
            for n in range(num_group):
                # Membership mask of quantile bucket n, as 1/NaN weights.
                group_pos[i][n] = DataFrame(
                    (n / num_group <= factor_quantile)
                    & (factor_quantile <= (n + 1) / num_group))
                group_pos[i][n][~group_pos[i][n]] = np.nan
                group_pos[i][n] = 1 * group_pos[i][n]
                # Cumulative excess return of the bucket over the universe.
                group_backtest[i][n] = ((group_pos[i][n] * ys[i]).mean(1) -
                                        ys[i].mean(1)).cumsum().rename(
                                            '%s' % (n / num_group))
        self.IC = IC
        self.IR = IR
        self.group_pos = group_pos
        self.group_backtest = group_backtest
        plt.figure(figsize=(16, 12))
        for i in range(len(ys)):
            IC[i].cumsum().plot()
        plt.legend(['%s' % i for i in range(len(ys))])
        plt.savefig('%s/Results/%s/%s/IC.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))
        plt.figure(figsize=(16, 12))
        for i in range(len(ys)):
            IR[i].cumsum().plot()
        plt.legend(['%s' % i for i in range(len(ys))])
        plt.savefig('%s/Results/%s/%s/IR.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))
        for i in range(len(ys)):
            plt.figure(figsize=(16, 12))
            for n in range(num_group):
                group_backtest[i][n].plot()
            plt.legend(['%s' % i for i in range(num_group)])
            plt.savefig(
                '%s/Results/%s/%s/groupbacktest%s.png' %
                (gc.MULTIFACTOR_PATH, self.factor_name, self.method, i))

    def update_factor(self):
        """Recompute the combined factor, neutralise it, append any new dates
        to the existing CSV and write the result back to the factor base."""
        self.set_factor()
        self.get_factor()
        self.combine_factor()
        self.factor = self.inf_to_nan(self.factor)
        #if 'industry' in self.neutral_list:
        if True:
            industrys = tools.get_industrys('L1', self.stocks)
            tmp = {}
            for k in industrys.keys():
                if len(industrys[k]) > 0:
                    tmp[k] = industrys[k]
            industrys = tmp
            factor = tools.standardize_industry(self.factor, industrys)
        #if 'market_capitalization' in self.neutral_list:
        if True:
            market_capitalization = DataFrame({
                stock: pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv' %
                                   (gc.DATABASE_PATH, stock),
                                   index_col=[0],
                                   parse_dates=[0]).loc[:, 'TOTMKTCAP']
                for stock in self.stocks
            })
            market_capitalization = np.log(market_capitalization)
            if self.start_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index >= self.start_date, :]
            if self.end_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index <= self.end_date, :]
            #if 'industry' in self.neutral_list:
            if True:
                market_capitalization = tools.standardize_industry(
                    market_capitalization, industrys)
            beta = (factor * market_capitalization).sum(1) / (
                market_capitalization * market_capitalization).sum(1)
            factor = factor - market_capitalization.mul(beta, axis=0)
        # factor.fillna(0, inplace=True)
        if os.path.exists('%s/Data/%s.csv' %
                          (gc.FACTORBASE_PATH, self.factor_name)):
            # Parse dates only when the existing file's index is not strings.
            if isinstance(factor.index[0], str):
                factor_old = pd.read_csv(
                    '%s/Data/%s.csv' % (gc.FACTORBASE_PATH, self.factor_name),
                    index_col=[0])
            else:
                factor_old = pd.read_csv(
                    '%s/Data/%s.csv' % (gc.FACTORBASE_PATH, self.factor_name),
                    index_col=[0],
                    parse_dates=[0])
            # Append only rows newer than what is already stored.
            factor = pd.concat([
                factor_old,
                factor.loc[factor.index > factor_old.index[-1], :]
            ], axis=0)
        factor.sort_index(axis=0, inplace=True)
        factor.sort_index(axis=1, inplace=True)
        factor.to_csv('%s/Data/%s.csv' %
                      (gc.FACTORBASE_PATH, self.factor_name))
Test_Dataset = [] for i in test_person: X = Dataset_originall[Dataset_originall['subject#'] == i] #subjectが1(1番の人)全体をdataframeそのものとして取り出す X1 = [X.iloc[:, 4].values, X.iloc[:, 5].values, X.iloc[:, 6:].values] #取り出したDataFrameからラベルyと特徴量Xを取り出す Test_Dataset.append(X1) # #次に各人のデータを区別せずにまとめて入れます # training_person_set = set(training_person) TRAININGDATA = DataFrame() for i in training_person: TRAININGDATA = TRAININGDATA.add( Dataset_originall[Dataset_originall['subject#'] == i], fill_value=0) X_training = TRAININGDATA.iloc[:, 6:].values y1_training = TRAININGDATA.iloc[:, 4].values y2_training = TRAININGDATA.iloc[:, 5].values TESTDATA = DataFrame() for i in test_person: TESTDATA = TESTDATA.add( Dataset_originall[Dataset_originall['subject#'] == i], fill_value=0) X_test = TESTDATA.iloc[:, 6:].values y1_test = TESTDATA.iloc[:, 4].values y2_test = TESTDATA.iloc[:, 5].values #特徴量の標準化 本来ならこうすべきだが、今回は全体の標準化量を計算する def Scaler(X, mean, variance):
def updateHeatmap(severity, weekdays, time, age):
    """Build a weekday-by-hour Plotly heatmap figure of casualty counts.

    severity -- list of casualty columns to sum ('死亡', '重傷', '軽傷')
    weekdays -- weekday labels to include (heatmap rows)
    time     -- (start_hour, end_hour) inclusive range from the rangeslider
    age      -- driver age bands used to filter the accident data
    Relies on module-level `acc` (accident DataFrame), `pivot_table`,
    `DataFrame` and `DAYSORT` defined elsewhere in this file.
    """
    # The rangeslider selects inclusively, but a python list stops before
    # the last number in a range.
    hours = [i for i in range(time[0], time[1] + 1)]
    # Take a copy of the dataframe, filtering it and grouping.
    acc2 = DataFrame(acc[['発生曜日', '発生時', '死亡', '重傷',
                          '軽傷']][(acc['甲_年齢'].isin(age))])
    acc2_hmap = DataFrame(index=weekdays, columns=hours)
    for sev in severity:
        acc2_pivot = pivot_table(data=acc2,
                                 values=sev,
                                 index='発生曜日',
                                 columns='発生時',
                                 aggfunc='sum')
        # FIX: narrowed the original bare `except:` -- only the label lookup
        # is expected to fail (when a selected weekday/hour has no rows).
        try:
            acc2_sum = DataFrame(acc2_pivot.loc[weekdays, hours],
                                 index=weekdays,
                                 columns=hours)
        except KeyError:
            acc2_sum = DataFrame(index=weekdays, columns=hours)
        acc2_hmap = acc2_hmap.add(acc2_sum, fill_value=0).fillna(0)

    # Apply text after grouping.
    def heatmapText(day, time, sr_hmap):
        FORMAT = '{}曜日 {:02d}時台<br>死傷者数: {:.0f}人'
        t_list = []
        for row in zip(range(time[0], time[1] + 1), sr_hmap):
            t_list.append(FORMAT.format(day, row[0], row[1]))
        return t_list

    # Pre-sort a list of days to feed into the heatmap.
    days = sorted(weekdays, key=lambda k: DAYSORT[k])
    # Create the z-values and text in a nested list format to match the
    # shape of the heatmap.
    z = []
    text = []
    for d in days:
        row = acc2_hmap.loc[d]
        t = heatmapText(d, time, acc2_hmap.loc[d])
        z.append(row)
        text.append(t)

    # Plotly standard 'Electric' colourscale is great, but the maximum value
    # is white, as is the colour for missing values. I set the maximum to the
    # penultimate maximum value, then spread out the others.
    # FIX: the next line had lost its leading '#' and was a syntax error:
    # Plotly colourscales here:
    # https://github.com/plotly/plotly.py/blob/master/plotly/colors.py
    Electric = [[0, 'rgb(0,0,0)'], [0.25, 'rgb(30,0,100)'],
                [0.55, 'rgb(120,0,100)'], [0.8, 'rgb(160,90,0)'],
                [1, 'rgb(230,200,0)']]

    # Heatmap trace
    traces = [{
        'type': 'heatmap',
        'x': hours,
        'y': days,
        'z': z,
        'text': text,
        'hoverinfo': 'text',
        'colorscale': Electric,
    }]

    fig = {
        'data': traces,
        'layout': {
            'paper_bgcolor': 'white',
            'font': {
                'color': 'black'
            },
            'height': 300,
            'title': '曜日・時間帯別の事故状況',
            'margin': {
                'b': 25,
                'l': 30,
                't': 70,
                'r': 0,
            },
            'xaxis': {
                'ticktext': hours,  # one tick per hour
                'tickvals': hours,
                'tickmode': 'array',
            }
        }
    }
    return fig
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Adding two Series aligns on the index; 'd' is missing from ser_a -> NaN.
ser_a = Series([100, 200, 300], index=['a', 'b', 'c'])
ser_b = Series([300, 400, 500, 600], index=['a', 'b', 'c', 'd'])
print(ser_a + ser_b)

# DataFrames align on both axes.
df1 = DataFrame(np.arange(4).reshape(2, 2),
                columns=['a', 'b'], index=['cars', 'bike'])
print(df1)
df2 = DataFrame(np.arange(9).reshape(3, 3),
                columns=['a', 'b', 'c'], index=['cars', 'bike', 'cycle'])
print(df2)
print(df1 + df2)  # labels missing from df1 become NaN

# With fill_value the missing positions are taken as 0 instead.
df1 = df1.add(df2, fill_value=0)
print(df1)

# Broadcasting a row Series across a DataFrame.
ser_c = df2.iloc[0]
print(ser_c)
print(df2 - ser_c)
def _cumulative_returns(returns: pd.DataFrame, is_log: bool):
    """Compound per-period returns into cumulative returns.

    Log returns accumulate by simple summation; arithmetic returns are
    compounded via (1 + r) products, then shifted back to return space.
    """
    if is_log:
        return returns.cumsum()
    return (returns + 1).cumprod() - 1
dframe2 = DataFrame(np.arange(9).reshape((3, 3)),
                    columns=list('ABC'),
                    index=['skyrim', 'asgard', 'bumi'])
dframe2

# In[8]:
# NOTE(review): `dframe1` is defined in an earlier cell not shown here.
dframe1 + dframe2

# In[9]:
dframe1

# In[12]:
dframe1.add(dframe2, fill_value=0)  # missing positions count as 0

# In[13]:
dframe1

# In[15]:
# BUG FIX: `.ix` was removed from pandas; positional row access is `.iloc`.
ser3 = dframe2.iloc[0]
ser3

# In[16]:
dframe2 - ser3  # broadcast: subtract the first row from every row

# In[ ]:
# print b # print c # print d # print e # print g # print h # print i # 获取行数 len(f.index) # 运算 DataFrame同理 s1 = Series(np.arange(10,20),index=np.arange(0,10)) s2 = Series(np.arange(50,60),index=np.arange(5,15)) s3 = s1 + s2 # 对原本没有的值进行填充 s4 = s1.add(s2,fill_value = 0) # print s3 # print s4 # Series 与 DataFrame的运算 s1 = f.ix[0,:] # DataFrame会每行都根据索引减去series中相应的值 f1 = f - s1 # print f1 # 若需要按列运算,需指定axis轴 s1 = f.ix[:,0] f1 = f.add(s1,axis = 0) print f1