def load_data(cls, date, code_list=None): print("Downloading data ...") tuple_str = "(" for s in report_period_generator(period=32, date=date): tuple_str += "'" + s + "'," tuple_str = tuple_str[:-1] tuple_str += ")" sql = ''' SELECT S_INFO_WINDCODE, REPORT_PERIOD, ACTUAL_ANN_DT, STATEMENT_TYPE, NET_PROFIT_EXCL_MIN_INT_INC AS net_income, TOT_OPER_REV AS total_revenue, OPER_REV AS revenue, TOT_OPER_COST AS total_opcost, LESS_OPER_COST AS operating_cost, LESS_SELLING_DIST_EXP AS sale_expense, LESS_GERL_ADMIN_EXP AS management_expense, RD_EXPENSE AS research_expense, LESS_FIN_EXP AS financial_expense, OPER_PROFIT AS operating_profit FROM wind.ASHAREINCOME WHERE SUBSTR(S_INFO_WINDCODE, 1, 1) != 'A' AND STATEMENT_TYPE in (408001000, 408004000, 408005000, 408050000) AND REPORT_PERIOD in {0} AND ACTUAL_ANN_DT <= {1} '''.format(tuple_str, date) df = db.query_by_SQL("wind", sql) return df, date
def load_data(cls, date, code_list=None): print("Downloading data ...") tuple_str = "(" for s in report_period_generator(period=32, date=date): tuple_str += "'" + s + "'," tuple_str = tuple_str[:-1] tuple_str += ")" sql = ''' SELECT S_INFO_WINDCODE, REPORT_PERIOD, ACTUAL_ANN_DT, STATEMENT_TYPE, NET_CASH_FLOWS_OPER_ACT AS operating_cashflow, STOT_CASH_INFLOWS_OPER_ACT AS operating_cashinflow, STOT_CASH_OUTFLOWS_OPER_ACT AS operating_cashoutflow, STOT_CASH_INFLOWS_INV_ACT AS investment_cashinflow, STOT_CASH_OUTFLOWS_INV_ACT AS investment_cashoutflow, NET_CASH_FLOWS_INV_ACT AS investment_cashflow, STOT_CASH_INFLOWS_FNC_ACT AS finance_cashinflow, STOT_CASH_OUTFLOWS_FNC_ACT AS finance_cashoutflow, NET_CASH_FLOWS_FNC_ACT AS finance_cashflow FROM wind.ASHARECASHFLOW WHERE SUBSTR(S_INFO_WINDCODE, 1, 1) != 'A' AND STATEMENT_TYPE in (408001000, 408004000, 408005000, 408050000) AND REPORT_PERIOD in {0} AND ACTUAL_ANN_DT <= {1} '''.format(tuple_str, date) df = db.query_by_SQL("wind", sql) return df, date
def load_data(cls, date, code_list=None): print("Downloading data ...") tuple_str = "(" for s in report_period_generator(period=32, date=date): tuple_str += "'" + s + "'," tuple_str = tuple_str[:-1] tuple_str += ")" sql = ''' SELECT S_INFO_WINDCODE, REPORT_PERIOD, ACTUAL_ANN_DT, STATEMENT_TYPE, TOT_ASSETS AS total_assets, TOT_SHRHLDR_EQY_EXCL_MIN_INT AS total_equities_exc_min, TOT_SHRHLDR_EQY_INCL_MIN_INT AS total_equities_inc_min, TOT_NON_CUR_LIAB AS noncur_liabilities, TOT_LIAB AS total_liabilities, LT_BORROW AS longterm_loan, BONDS_PAYABLE AS bonds_payable, LT_PAYABLE AS longterm_payable, OTHER_EQUITY_TOOLS_P_SHR AS preferred_stock, MONETARY_CAP AS cash, TRADABLE_FIN_ASSETS AS tradable_financialasset, NOTES_RCV AS notes_receiveable, ACCT_RCV AS accounts_receivable, INVENTORIES AS inventory, FIX_ASSETS AS fixed_asset, CONST_IN_PROG AS construction_inprogress, INTANG_ASSETS AS intangible_asset, R_AND_D_COSTS AS development_expenditure, GOODWILL AS goodwill, NOTES_PAYABLE AS notes_payable, ACCT_PAYABLE AS accounts_payable FROM wind.ASHAREBALANCESHEET WHERE SUBSTR(S_INFO_WINDCODE, 1, 1) != 'A' AND STATEMENT_TYPE in (408001000, 408004000, 408005000, 408050000) AND REPORT_PERIOD in {0} AND ACTUAL_ANN_DT <= {1} '''.format(tuple_str, date) df = db.query_by_SQL("wind", sql) return df, date
def fit(cls, datas): print("Processing data ...") df, date = datas df = pd.merge(df, get_listed_stocks(), on="s_info_windcode") df.sort_values(by=[ "s_info_windcode", "report_period", "actual_ann_dt", "statement_type" ], inplace=True) current_df = df.groupby(by="s_info_windcode").last() data_series = [(dt.datetime.strftime(dt.datetime.now(), "%Y%m%d"), list(current_df.index), current_df)] for snapshot_date in report_period_generator(period=24, date=date): # df_slice = df[(df["actual_ann_dt"] <= snapshot_date) & (df["report_period"] <= snapshot_date)].copy() df_slice = df[df["report_period"] <= snapshot_date].copy() df_slice.sort_values(by=[ "s_info_windcode", "report_period", "actual_ann_dt", "statement_type" ], inplace=True) df_slice = df_slice.groupby(by="s_info_windcode").last() data_series.append([snapshot_date, list(df_slice.index), df_slice]) data_df = pd.DataFrame(data_series, columns=["date", "ticker_list", "report"]) for ticker in data_df.loc[0, "ticker_list"]: level = 0 while level < 20 and ticker in data_df.loc[level + 1, "ticker_list"]: level += 1 data_df.loc[0, "report"].loc[ticker, "max_level"] = level data_df.loc[0, "report"]["max_level"] = data_df.loc[ 0, "report"]["max_level"].astype(int) print("Making snapshots ...") ttm_df = pd.DataFrame( columns=list(data_df.loc[0, "report"].columns)[3:-1]) for ticker in data_df.loc[0, "ticker_list"]: if data_df.loc[0, "report"].loc[ticker, "max_level"] >= 5: shift = Quarters2LastDec(date) ttm_df.loc[ticker, :] = data_df.loc[0, "report"].ix[ticker, 3:] \ + data_df.loc[shift, "report"].ix[ticker, 3:] - data_df.loc[5, "report"].ix[ticker, 3:] ttm_df = ttm_df.reset_index() ttm_df.rename(columns={'index': 'code'}, inplace=True) ttm_df["trade_date"] = date ttm_df["trade_date"] = date new_cols = list(ttm_df.columns) new_cols.insert(0, new_cols.pop(new_cols.index("trade_date"))) ttm_df = ttm_df[new_cols] for col in ttm_df.columns[2:]: ttm_df[col + "_snapshots"] = None for col in ttm_df.columns: if col.endswith("_snapshots"): for index in ttm_df.index: ttm_df.set_value(index, col, dict()) ttm_df.index = ttm_df["code"] for k in range(20, 0, -1): snapshot_date = data_df.loc[k, "date"] shift = Quarters2LastDec(snapshot_date) for ticker in data_df.loc[0, "ticker_list"]: if data_df.loc[0, "report"].loc[ticker, "max_level"] >= k + 4: for col in data_df.loc[k + shift, "report"].columns[3:]: ttm_df.loc[ticker, col + "_snapshots"][snapshot_date] = data_df.loc[k, "report"].loc[ticker, col] \ + data_df.loc[k + shift, "report"].loc[ticker, col] - data_df.loc[k + 4, "report"].loc[ticker, col] print("Converting some NaN to 0 ...") df = ttm_df for col in df.columns: if col.endswith("_snapshots"): df[col] = df[col].astype(str) for factor in [ "net_income", "total_revenue", "revenue", "total_opcost", "operating_cost", "sale_expense", "management_expense", "research_expense", "financial_expense", "operating_profit" ]: df[factor + "_snapshots"] = df[factor + "_snapshots"].apply( lambda s: s.replace("{}", '')) if factor not in ["net_income", "total_revenue"]: df[factor] = df[factor].apply(lambda x: 0 if pd.isna(x) else x) df[factor + "_snapshots"] = df[factor + "_snapshots"].apply( lambda s: s.replace("nan", '0')) return df
def fit(cls, datas): df, date = datas print("Processing data ...") df = pd.merge(df, get_listed_stocks(), on="s_info_windcode") df.sort_values(by=["s_info_windcode", "report_period", "actual_ann_dt", "statement_type"], inplace=True) current_df = df.groupby(by="s_info_windcode").last() data_series = [(dt.datetime.strftime(dt.datetime.now(), "%Y%m%d"), list(current_df.index), current_df)] for snapshot_date in report_period_generator(period=20, date=date): df_slice = df[(df["actual_ann_dt"] <= snapshot_date) & (df["report_period"] <= snapshot_date)].copy() df_slice.sort_values(by=["s_info_windcode", "report_period", "actual_ann_dt", "statement_type"], inplace=True) df_slice = df_slice.groupby(by="s_info_windcode").last() data_series.append([snapshot_date, list(df_slice.index), df_slice]) data_df = pd.DataFrame(data_series, columns=["date", "ticker_list", "report"]) for ticker in data_df.loc[0, "ticker_list"]: level = 0 while level < 20 and ticker in data_df.loc[level + 1, "ticker_list"]: level += 1 data_df.loc[0, "report"].loc[ticker, "max_level"] = level data_df.loc[0, "report"]["max_level"] = data_df.loc[0, "report"]["max_level"].astype(int) df = data_df.loc[0, "report"].copy() df.drop(["report_period", "actual_ann_dt", "statement_type", "max_level"], axis=1, inplace=True) for factor in df.columns: df[factor + "_snapshots"] = [dict(), ] * df.shape[0] for col in df.columns: if col.endswith("_snapshots"): for index in df.index: df.set_value(index, col, dict()) print("Making snapshots ...") for ticker in data_df.loc[0, "ticker_list"]: max_level = data_df.loc[0, "report"].loc[ticker, "max_level"] for factor in data_df.loc[0, "report"].columns[ 3: -1]: # Exclude ["report_period", "actual_ann_dt", "statement_type", "max_level"] for level in range(max_level, 0, -1): df.loc[ticker, factor + "_snapshots"][data_df.loc[level, "date"]] = \ data_df.loc[level, "report"].loc[ticker, factor] for col in df.columns: if col.endswith("_snapshots"): df[col] = df[col].astype(str) print("Converting some NaN to 0 ...") for factor in ['total_assets', 'total_equities_exc_min', 'total_equities_inc_min', 'noncur_liabilities', 'total_liabilities', 'longterm_loan', 'bonds_payable', 'longterm_payable', 'preferred_stock', "cash", "tradable_financialasset", "notes_receiveable", "accounts_receivable", "inventory", "fixed_asset", "construction_inprogress", "intangible_asset", "development_expenditure", "goodwill", "notes_payable", "accounts_payable"]: df[factor + "_snapshots"] = df[factor + "_snapshots"].apply(lambda s: s.replace("{}", '')) if factor not in ['total_assets', 'total_equities_exc_min', 'total_equities_inc_min', 'noncur_liabilities', 'total_liabilities']: df[factor] = df[factor].apply(lambda x: 0 if pd.isna(x) else x) df[factor + "_snapshots"] = df[factor + "_snapshots"].apply(lambda s: s.replace("nan", '0')) df = df.reset_index() df.rename(columns={'s_info_windcode': 'code'}, inplace=True) df["trade_date"] = date new_cols = list(df.columns) new_cols.insert(0, new_cols.pop(new_cols.index("trade_date"))) # Put trade_date into the first column df = df[new_cols] return df
def fit(cls, datas): print("Processing data ...") df, date = datas df = pd.merge(df, get_listed_stocks(), on="s_info_windcode") df.sort_values(by=[ "s_info_windcode", "report_period", "actual_ann_dt", "statement_type" ], inplace=True) current_df = df.groupby(by="s_info_windcode").last() data_series = [(dt.datetime.strftime(dt.datetime.now(), "%Y%m%d"), list(current_df.index), current_df)] for snapshot_date in report_period_generator(period=24, date=date): df_slice = df[df["report_period"] <= snapshot_date].copy() df_slice.sort_values(by=[ "s_info_windcode", "report_period", "actual_ann_dt", "statement_type" ], inplace=True) df_slice = df_slice.groupby(by="s_info_windcode").last() data_series.append([snapshot_date, list(df_slice.index), df_slice]) data_df = pd.DataFrame(data_series, columns=["date", "ticker_list", "report"]) for ticker in data_df.loc[0, "ticker_list"]: level = 0 while level < 20 and ticker in data_df.loc[level + 1, "ticker_list"]: level += 1 data_df.loc[0, "report"].loc[ticker, "max_level"] = level data_df.loc[0, "report"]["max_level"] = data_df.loc[ 0, "report"]["max_level"].astype(int) print("Calculating TTM ...") shift = Quarters2LastDec(date) for ticker in data_df.loc[0, "ticker_list"]: if data_df.loc[0, "report"].loc[ticker, "max_level"] >= 5: for col in data_df.loc[0, "report"].columns[3:-1]: data_df.loc[0, "report"].loc[ticker, col] = data_df.loc[0, "report"].loc[ticker, col] \ + data_df.loc[shift, "report"].loc[ticker, col] - data_df.loc[5, "report"].loc[ticker, col] print("Making snapshots ...") df = data_df.loc[0, "report"].copy() df.drop( ["report_period", "actual_ann_dt", "statement_type", "max_level"], axis=1, inplace=True) for factor in df.columns: df[factor + "_snapshots"] = [ dict(), ] * df.shape[0] for col in df.columns: if col.endswith("_snapshots"): for index in df.index: df.set_value(index, col, dict()) for ticker in data_df.loc[0, "ticker_list"]: max_level = data_df.loc[0, "report"].loc[ticker, "max_level"] for factor in data_df.loc[0, "report"].columns[ 3: -1]: # Exclude ["report_period", "actual_ann_dt", "statement_type", "max_level"] for level in range(max_level, 0, -1): df.loc[ticker, factor + "_snapshots"][data_df.loc[level, "date"]] = \ data_df.loc[level, "report"].loc[ticker, factor] for col in df.columns: if col.endswith("_snapshots"): df[col] = df[col].astype(str) print("Converting some NaN to 0 ...") for col in df.columns: if col.endswith("_snapshots"): df[col] = df[col].astype(str) for factor in [ "operating_cashflow", "operating_cashinflow", "operating_cashoutflow", "investment_cashinflow", "investment_cashoutflow", "investment_cashflow", "finance_cashinflow", "finance_cashoutflow", "finance_cashflow" ]: df[factor + "_snapshots"] = df[factor + "_snapshots"].apply( lambda s: s.replace("{}", '')) df[factor] = df[factor].apply(lambda x: 0 if pd.isna(x) else x) df[factor + "_snapshots"] = df[factor + "_snapshots"].apply( lambda s: s.replace("nan", '0')) df = df.reset_index() df.rename(columns={'s_info_windcode': 'code'}, inplace=True) df["trade_date"] = date new_cols = list(df.columns) new_cols.insert(0, new_cols.pop(new_cols.index( "trade_date"))) # Put trade_date into the first column df = df[new_cols] return df