def get_stock_list(stock_type='上市'): df_stocks = None if stock_type == '上市': df_stocks = pd.read_csv(gen_output_path('data', '上市.csv'), engine='python', encoding='big5') elif stock_type == '上櫃': df_stocks = pd.read_csv(gen_output_path('data', '上櫃.csv'), engine='python', encoding='big5') return df_stocks
def test_parse_balance_sheet(self): # stock_list = get_stock_codes(stock_type='上市') + get_stock_codes(stock_type='上櫃') stock_list = [2801] # stock_list = [1409] empty_list = [] non_investment = [] non_property = [] for stock_id in stock_list: # print('parse ', stock_id) file_name = "balance_sheet_data_" + str(stock_id) input_path = gen_output_path('raw_datas/test_balance_sheets', file_name) year = 2020 season = 1 balance_sheet_processor = SimpleBalanceSheetProcessor(stock_id) with open(input_path, 'rb') as in_put: # str_trs = in_put.readlines() raw_input = in_put.read() in_put.close() soup = BeautifulSoup(raw_input, 'html.parser') # print(soup.prettify()) parse_data = balance_sheet_processor.parse_balance_sheet(soup, year, season) if parse_data is not None and len(parse_data) < 2: if len(parse_data) == 0: empty_list.append(stock_id) else: if '長期投資' not in parse_data: non_investment.append(stock_id) if '固定資產' not in parse_data: non_property.append(stock_id) print('empty_list = ', empty_list) print('non_investment = ', non_investment) print('non_property = ', non_property)
def test_get_prediction(self): with open(gen_output_path('data', 'prices.xlsx'), 'rb') as file: df = pd.read_excel(file) file.close() prices = df.loc[:, '收盤價'] # errors = generate_predictions(prices, get_stock_codes(stock_type='上市') + get_stock_codes(stock_type='上櫃')) # print('test_get_prediction errors = ', errors) result = generate_prediction(2841, float(prices.loc[str(2841)])) print(result)
def get_raw_data(input_dir, file_name): input_path = gen_output_path(input_dir, file_name) try: with open(input_path, 'rb') as in_put: raw_input = in_put.read() in_put.close() return raw_input except FileNotFoundError as fnfe: return None
def action(stock_id, time_line): year = time_line['year'] season = time_line.get('season') dir_path = (root_dir_path + str(year)) if season is None else (root_dir_path + str(year) + "Q" + str(season)) file_path = gen_output_path(dir_path, str(stock_id)) if path.exists(file_path) is False: result = fetcher(stock_id, year) if season is None else fetcher(stock_id, year, season) if result is not None: store_raw_data(result, dir_path, str(stock_id))
def fetch_price_measurement_raw_datas(stock_ids): db = mongo_client[DB_TWSE] collection = db[TABLE_TWSE_PRICE_MEASUREMENT] for stock_id in stock_ids: params = {'STOCK_ID': stock_id} path_dir = os.path.abspath(gen_output_path(PATH_DIR_RAW_DATA_PRICE_MEASUREMENT)) url = "https://goodinfo.tw/StockInfo/StockBzPerformance.asp?" + urlencode(params) print('url = ', url) profile = FirefoxProfile() profile.set_preference("browser.download.panel.shown", False) profile.set_preference("browser.download.folderList", 2) profile.set_preference("browser.download.dir", path_dir) profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/html") driver = webdriver.Firefox(firefox_profile=profile) driver.get(url) inputs = driver.find_elements_by_tag_name("input") download_button = None for input_ in inputs: if input_.get_attribute("value") == '匯出HTML': download_button = input_ download_button.click() driver.quit() file_path = os.path.join(path_dir, "BzPerformance.html") try: output_file_path = gen_output_path(PATH_DIR_RAW_DATA_PRICE_MEASUREMENT, str(stock_id)) # copyfile(file_path, output_file_path) with open(file_path, 'rb') as in_put: raw_input = in_put.read() in_put.close() print(raw_input) record = { "stock_id": str(stock_id), "content": raw_input } post_id = collection.insert_one(record) print("post_id = ", post_id) except Exception as inst: __logger.error("get exception in " + str(stock_id) + ":" + str(inst)) traceback.print_tb(inst.__traceback__) finally: os.remove(file_path)
def read_dfs(stock_id, path=None): try: with open( path if path is not None else gen_output_path( 'data', 'performance_{0}.xlsx'.format(str(stock_id))), 'rb') as file: df_profit_matrix = pd.read_excel(file, sheet_name=None) file.close() return df_profit_matrix except FileNotFoundError as err: traceback.print_tb(err.__traceback__) return None
def get_data_frame(self, year=None, season=None, indexType=IndexType.INT_INDEX): # create a new Firefox session path = os.path.abspath(gen_output_path("data")) print("path = ", path) params = {'STOCK_ID': self._stock_id} url = "https://goodinfo.tw/StockInfo/StockBzPerformance.asp?" + urlencode(params) print('url = ', url) profile = FirefoxProfile() profile.set_preference("browser.download.panel.shown", False) profile.set_preference("browser.download.folderList", 2) profile.set_preference("browser.download.dir", path) profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/html") driver = webdriver.Firefox(firefox_profile=profile) driver.get(url) elements = driver.find_elements_by_id("txtFinDetailData") buttons = elements[0].find_elements_by_xpath(".//input") buttons[5].click() driver.quit() file_path = os.path.join(path, "BzPerformance.html") print("path2 = ", file_path) try: df = pd.read_html(file_path)[0] years = self.__correct_years(df.loc[:, 0].tolist()) prices = list(map(lambda price: float(price) if price != '-' else float(0), df.loc[:, 4].tolist())) if indexType == IndexType.INT_INDEX: data = {'年度': years, '平均股價': prices} df = pd.DataFrame(data) df = df.set_index(['年度']) else: # periodIndexes = list(map(lambda year: pd.PeriodIndex(start=pd.Period(year, freq='Y'), # end=pd.Period(year, freq='Y'), # freq='Y'), years)) # print('periodIndex = ', periodIndexes) periodIndex = pd.PeriodIndex(start=pd.Period(years[-1], freq='Y'), end=pd.Period(years[0], freq='Y'), freq='Y') print(periodIndex) print([prices[i:i + 1] for i in range(0, len(prices))]) df = pd.DataFrame(data=[prices[i:i + 1] for i in range(len(prices) - 1, -1, -1)], columns=['平均股價'], index=periodIndex) print('df = ', df) # df['平均股價'] = df['平均股價'].map(lambda price: float(price) if price != '-' else float(0)) # df2.index = pd.PeriodIndex(df2.index, freq='A') except Exception as inst: print("get exception", inst) traceback.print_tb(inst.__traceback__) return None finally: os.remove(file_path) return df
def generate_predictions2(df_prices, stock_ids=[]): error_stock_ids = [] df_predictions = None for stock_id in stock_ids: str_stock_id = str(stock_id) # try: # stock_data = get_stock_data(str_stock_id) # except: # stock_data = None # # if stock_data is None: # continue try: stock_data = read(str_stock_id) s_stock = get_predict_evaluate(stock_data, float(df_prices.loc[str_stock_id])) except Exception as e: print("Unexpected error:", e) traceback.print_tb(e.__traceback__) print('Get error when get stock ', stock_id, ' stock_data = ', stock_data) error_stock_ids.append(stock_id) s_stock = None except: print("Unexpected error:", sys.exc_info()[0]) error_stock_ids.append(stock_id) s_stock = None if s_stock is None: continue if df_predictions is None: df_predictions = pd.DataFrame(columns=s_stock.index, data=[s_stock.values]) df_predictions['股號'] = [str_stock_id] df_predictions = df_predictions.set_index('股號') print("first record") print(df_predictions) print("index = ", df_predictions.index) else: print('get index = ', df_predictions.index) df_predictions.loc[str_stock_id] = s_stock.values print('result = ', df_predictions) print('err_id = ', error_stock_ids) output_path = gen_output_path('data', 'evaluations.xlsx') with pd.ExcelWriter(output_path) as writer: df_predictions.to_excel(writer, sheet_name='predictions')
def generate_predictions(df_prices, stock_ids=[]): error_stock_ids = [] results = None df_predictions = None df_stock_list = _get_stock_list() for stock_id in stock_ids: str_stock_id = str(stock_id) try: result = None if not df_prices.index.contains(str_stock_id) else generate_prediction(stock_id, float(df_prices.loc[str_stock_id])) except Exception as e: print("Unexpected error:", sys.exc_info()[0]) traceback.print_tb(e.__traceback__) error_stock_ids.append(stock_id) result = None if result is not None: if results is None: results = {'公司代號': [stock_id], '公司簡稱': [df_stock_list.loc[str_stock_id, '公司簡稱']]} results.update(dict(map(lambda x: (x[0], [x[1]]), result.items()))) print(list(results.keys())) # df_predictions = pd.DataFrame(result) # df_predictions['股號'] = [str_stock_id] # df_predictions = df_predictions.set_index('股號') print("first record") # print(df_predictions) else: for pair in result.items(): results[pair[0]].append(pair[1]) results['公司代號'].append(stock_id) results['公司簡稱'].append(df_stock_list.loc[str_stock_id, '公司簡稱']) print('results = ', results) df_predictions = pd.DataFrame(results) df_predictions = df_predictions.set_index('公司代號') print("result = ", df_predictions) output_path = gen_output_path('data', 'evaluations.xlsx') with pd.ExcelWriter(output_path) as writer: df_predictions.to_excel(writer, sheet_name='predictions') return error_stock_ids
def store_df(stock_id, dict_sheet_dfs, directory=None, filename=None): if dict_sheet_dfs is None or len( dict_sheet_dfs.items()) == 0 or stock_id is None: print('dict_sheet_dfs and stock_id should not be None') return output_path = gen_output_path( directory if directory is not None else 'data', filename=filename if filename is not None else 'performance_{0}.xlsx'.format(stock_id)) dict_dfs = read_dfs(stock_id, output_path) if dict_dfs is None: dict_dfs = {} for sheet_name, df in dict_sheet_dfs.items(): dict_dfs[sheet_name] = df with pd.ExcelWriter(output_path) as writer: for sheet_name, df in dict_dfs.items(): print('sheet_name = ', sheet_name, " df = ", df) df.to_excel(writer, sheet_name=sheet_name) writer.close()
def _read_df_datas(stock_id): df_statements = read_dfs(stock_id, gen_output_path('data', 'statments_{0}.xlsx'.format(str(stock_id)))) if df_statements is None: df_statements = {} df_profit_statement = df_statements.get('profit_statement', None) df_balance_sheet = df_statements.get('balance_sheet', None) df_cash_flow_statement = df_statements.get('cash_flow_statement', None) df_dividend_policy = df_statements.get('dividend_policy', None) df_performance = df_statements.get('performance', None) print('df_cash_flow_statement = ', df_cash_flow_statement) print('df_dividend_policy = ', df_dividend_policy) print('df_balance_sheet = ', df_balance_sheet) if df_profit_statement is not None: df_profit_statement = df_profit_statement.set_index([df_profit_statement.index.to_period("Q")]) df_statements['profit_statement'] = df_profit_statement print('df_profit_statement = ', df_profit_statement) if df_balance_sheet is not None: df_balance_sheet = df_balance_sheet.set_index([df_balance_sheet.index.to_period("Q")]) df_balance_sheet.columns = ['固定資產', '每股淨值', '長期投資', ('權益總額', '期初餘額'), ('權益總額', '期末餘額')] df_statements['balance_sheet'] = df_balance_sheet print('df_balance_sheet columns = ', df_balance_sheet.columns) if df_cash_flow_statement is not None: df_cash_flow_statement = df_cash_flow_statement.set_index([df_cash_flow_statement.index.to_period("Q")]) df_statements['cash_flow_statement'] = df_cash_flow_statement print('df_cash_flow_statement = ', df_cash_flow_statement) if df_dividend_policy is not None: df_dividend_policy = df_dividend_policy.set_index([df_dividend_policy.index.to_period("Y")]) df_statements['dividend_policy'] = df_dividend_policy print('df_dividend_policy = ', df_dividend_policy) if df_performance is not None: df_performance = df_performance.set_index([df_performance.index.to_period("Y")]) df_statements['performance'] = df_performance return df_statements
def test_predict_evaluation(self): from stock_data import read stock_data = read('2330') self.assertIsNotNone(stock_data) df = None with open(gen_output_path('data', 'prices.xlsx'), 'rb') as file: df = pd.read_excel(file) file.close() s_2330 = get_predict_evaluate(stock_data, float(df.loc['2330', '收盤價'])).rename('2330') result = pd.concat([s_2330], axis=1) print('result 1', result.T) stock_data = read('6294') s_6294 = get_predict_evaluate(stock_data, float(df.loc['6294', '收盤價'])).rename('6294') self.assertIsNotNone(stock_data) result.loc[:, '6294'] = s_6294 print('result 2', result.T) result.loc[:, '6294'] = s_6294 print('result 3', result.T)
def get_prices(): crawler = Crawler() df = crawler.get_data((2021, 9, 7)) with pd.ExcelWriter(gen_output_path('data', 'prices.xlsx')) as writer: df.to_excel(writer) writer.close()
def generate_predictions(stock_ids=[]): try: with open(gen_output_path('data', 'evaluations.xlsx'), 'rb') as file: df_predictions = pd.read_excel(file, sheet_name='predictions', dtype={'股號': str}) df_predictions = df_predictions.set_index('股號') except FileNotFoundError as err: df_predictions = None # print('index =', df_predictions.index) # print(df_predictions) # df_predictions.reset_index() # print('index =', df_predictions.index) # df_predictions.reindex(df_predictions.index.astype('str')) # print('index =', df_predictions.index) # # print(df_predictions) tor_proc = None if not _tor_process_exists(): print('open tor process') tor_proc = _launch_tor() controller = Controller.from_port(port=ctrl_port) controller.authenticate() socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", proxy_port) socket.socket = socks.socksocket newIP = urlopen("http://icanhazip.com").read() print("NewIP Address: %s" % newIP) error_stock_ids = [] for stock_id in stock_ids: str_stock_id = str(stock_id) if df_predictions is not None and str_stock_id in df_predictions.index: continue try: stock_data = get_stock_data(stock_id) except: stock_data = None if stock_data is None: continue break_loop = False while True: try: s_stock = get_predict_evaluate(stock_data) break except (IndexError, requests.exceptions.ConnectionError, NewConnectionError, socks.SOCKS5Error) as e: print("get exception: ", e, " for ", stock_id) controller.signal(Signal.NEWNYM) if not controller.is_newnym_available(): print("Waiting time for Tor to change IP: " + str(controller.get_newnym_wait()) + " seconds") time.sleep(controller.get_newnym_wait()) newIP = urlopen("http://icanhazip.com").read() print("NewIP Address: %s" % newIP) except Exception as e: print("Unexpected error:", e) traceback.print_tb(e.__traceback__) print('Get error when get stock ', stock_id, ' stock_data = ', stock_data) error_stock_ids.append(stock_id) # break_loop = True s_stock = None break if break_loop: break if s_stock is None: continue if df_predictions is None: # df_predictions = pd.DataFrame(columns=s_stock.index, data=[s_stock.values], index=[str_stock_id]) df_predictions = pd.DataFrame(columns=s_stock.index, data=[s_stock.values]) df_predictions['股號'] = [str_stock_id] df_predictions = df_predictions.set_index('股號') print("first record") print(df_predictions) print("index = ", df_predictions.index) else: print('get index = ', df_predictions.index) df_predictions.loc[str_stock_id] = s_stock.values print('result = ', df_predictions) output_path = gen_output_path('data', 'evaluations.xlsx') with pd.ExcelWriter(output_path) as writer: df_predictions.to_excel(writer, sheet_name='predictions') # print('error_ids = ', error_stock_ids) controller.close() tor_proc.terminate()
def get_raw_data(self, input_dir, file_name): input_path = gen_output_path(input_dir, file_name) with open(input_path, 'rb') as in_put: raw_input = in_put.read() in_put.close() return raw_input
def store_raw_data(self, data, output_dir, file_name): if data is not None: output_path = gen_output_path(output_dir, file_name) with open(output_path, 'wb') as output: output.write(data) output.close()
def test_tsec_crawler(self): crawler = Crawler() df = crawler.get_data((2021, 4, 29)) with pd.ExcelWriter(gen_output_path('data', 'prices.xlsx')) as writer: df.to_excel(writer) writer.close()
}]}, {'$set': { "content": raw_data }}, upsert=True) def get_prices(): crawler = Crawler() df = crawler.get_data((2021, 9, 7)) with pd.ExcelWriter(gen_output_path('data', 'prices.xlsx')) as writer: df.to_excel(writer) writer.close() if __name__ == "__main__": logger.info('start') twse_code_list = get_stock_codes(stock_type='上市') tpex_code_list = get_stock_codes(stock_type='上櫃') sync_statements(twse_code_list) sync_statements(tpex_code_list) sync_performance(twse_code_list + tpex_code_list) get_prices() with open(gen_output_path('data', 'prices.xlsx'), 'rb') as file: df = pd.read_excel(file) file.close() prices = df.loc[:, '收盤價'] errors = generate_predictions( prices, get_stock_codes(stock_type='上市') + get_stock_codes(stock_type='上櫃')) print('test_get_prediction errors = ', errors)