def update_sfd_in_list(stock_id_list, sfd_dir, smd_dir, days, force_update=False):
    # Skip updates during trading hours (08:00-15:59).
    now = datetime.datetime.now()
    if 8 <= now.hour <= 15:
        return
    update_log_path = sfd_dir + "/update.log"
    if is_smd_need_update(update_log_path) or force_update:
        # Split the id list into chunks of at most 200 and update each
        # chunk in its own thread; range() also covers lists shorter
        # than a single chunk.
        id_list_list = []
        for i in range(0, len(stock_id_list), 200):
            id_list_list.append(stock_id_list[i:i + 200])
        t_list = []
        for cur_id_list in id_list_list:
            t = threading.Thread(target=t_update_sfd_in_list,
                                 args=(cur_id_list, sfd_dir, smd_dir, days))
            t_list.append(t)
            t.start()
        for t in t_list:
            while t.is_alive():
                tools.delay(10)
        communicate.master_close()
        # Record the update hour so later calls can skip redundant work.
        with open(update_log_path, 'w') as update_log_file:
            update_log_file.write(datetime.datetime.now().strftime("%Y/%m/%d/%H"))
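# is_smd_need_update() is called above but not defined in this extract.
# The following is a hypothetical sketch, assuming the update log stores
# the "%Y/%m/%d/%H" stamp written by the update functions and that data
# counts as stale once that stamp differs from the current hour; the
# real helper may use a different rule.
import datetime
import os


def is_smd_need_update(update_log_path):
    # No log yet: the data has never been updated.
    if not os.path.isfile(update_log_path):
        return True
    with open(update_log_path) as f:
        logged = f.read().strip()
    # Stale once the logged hour differs from the current hour.
    return logged != datetime.datetime.now().strftime("%Y/%m/%d/%H")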
def update_livedata_dict(stock_id_list, livedata_dict):
    # Skip the 07:30-08:30 pre-open window.
    now = datetime.datetime.now()
    if (7 * 60 + 30) < (now.hour * 60 + now.minute) < (8 * 60 + 30):
        tools.delay(300)
        return
    logger.logp("update_livedata : start")
    t_start = datetime.datetime.now()
    # Query live data in batches of at most 100 ids; range() also covers
    # lists shorter than a single batch.
    id_list_list = []
    for i in range(0, len(stock_id_list), 100):
        id_list_list.append(stock_id_list[i:i + 100])
    for i, cur_id_list in enumerate(id_list_list):
        logger.logp("get live data {} / {}".format(i + 1, len(id_list_list)))
        livedata_list = crawler.get_livedata_list(cur_id_list)
        logger.logp("get live data OK")
        if livedata_list is not None:
            logger.logp("read live data list")
            read_livedata_list(livedata_list, livedata_dict)
            logger.logp("read live data list OK")
    t_end = datetime.datetime.now()
    logger.logp("update_livedata : Total time = {} s".format(
        (t_end - t_start).total_seconds()))
    logger.logp("update_livedata : Done")
def get_month_data(year, month, stock_id):
    logger.logp("Get month data: {} {}".format(year, month))
    arg = "STOCK_DAY?response=json&date={}{:02d}01&stockNo={}".format(
        year, month, stock_id)
    url = "http://www.twse.com.tw/exchangeReport/" + arg
    tools.delay(5)  # throttle before hitting TWSE
    max_try = 3
    from socket import timeout
    while True:
        logger.logp("Trying connection...")
        try:
            res = urllib.request.urlopen(url, timeout=10)
            logger.logp("OK")
        except timeout:
            logger.logp("Error: urllib -- timeout")
            tools.wait_retry(logger, 10)
            continue
        except Exception as e:
            logger.logp("Error: urllib")
            logger.logp(e)
            tools.wait_retry(logger, 30)
            continue
        logger.logp("Trying json decode...")
        data = ""
        try:
            data = json.loads(res.read().decode())
            if data["stat"] != "OK":
                # "很抱歉,沒有符合條件的資料!" means "sorry, no data matches
                # the given criteria": an empty month, not an error.
                if data["stat"] == "很抱歉,沒有符合條件的資料!":
                    return []
                logger.logp("data error: stat = {}".format(data["stat"]))
                tools.wait_retry(logger, 5)
                if max_try == 0:
                    return None
                max_try -= 1
                continue
        except Exception:
            logger.logp("Error: json \"{}\"".format(data))
            tools.wait_retry(logger, 5)
            continue
        # check that the content actually belongs to the requested month
        if tools.check_smd_content_by_key(data["data"][0], year * 100 + month):
            return data["data"]
        else:
            logger.logp("error content: {} {}".format(year * 100 + month,
                                                      data["data"]))
            tools.wait_retry(logger, 5)
            continue
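# Hypothetical usage of get_month_data(); "2330" and 2017/5 are example
# inputs, not values taken from this codebase.
def _demo_get_month_data():
    rows = get_month_data(2017, 5, "2330")
    if rows is None:
        print("gave up after repeated errors")
    elif not rows:
        print("no trading data for that month")
    else:
        # Each row is a list of strings straight from the TWSE JSON
        # (date, volume, turnover, open, high, low, close, change,
        # transaction count -- field order assumed, not verified here).
        for row in rows:
            print(row)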
def main_loop():
    logger = msgopt.Logger("main", print)
    global is_ready
    while True:
        updated = True
        dtd_updated = True
        logger.logp("update_listed_list : start")
        dataio.update_listed_list(listed_sid_path)
        logger.logp("update_listed_list : done\n")
        dataio.update_all_dtd(dtd_dir, months)
        listed_id_list = dataio.get_stock_id_list(listed_sid_path)
        logger.logp("update_smd_in_list : start")
        force_update = False
        dataio.update_smd_in_list(
            listed_id_list, trade_data_dir, months, force_update)
        logger.logp("update_smd_in_list : done\n")
        dataio.update_sfd_in_list(
            listed_id_list, sfd_dir, trade_data_dir, 365 * 4, force_update)
        dataio.update_livedata_dict(listed_id_list, livedata_dict)
        is_ready = True
        # Poll live data until the next batch update is due: break at
        # 15:00 for the daily update, or at 01:00 for the
        # day-trading-data update, then rerun the outer loop.
        while True:
            now = datetime.datetime.now()
            if now.hour == 15 and not updated:
                break
            if now.hour == 1 and not dtd_updated:
                break
            if not updated:
                dataio.update_livedata_dict(listed_id_list, livedata_dict)
            if 8 <= now.hour < 14:
                # During trading hours, keep refreshing live data
                # without the 300 s sleep.
                if updated:
                    updated = False
                continue
            if now.hour == 0:
                if dtd_updated:
                    dtd_updated = False
            # # debug
            # dataio.update_livedata_dict(listed_id_list, livedata_dict)
            # continue
            # # end debug
            logger.logp("sleep 300s ...\n")
            tools.delay(300)
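# main_loop() and debug_loop() rely on module-level state that is not
# part of this extract. Hypothetical placeholders; the paths and the
# meaning of `months` are assumptions, not values from the real config:
is_ready = False
livedata_dict = {}
listed_sid_path = "data/listed_sid.csv"  # assumed path
trade_data_dir = "data/smd"              # assumed path
sfd_dir = "data/sfd"                     # assumed path
dtd_dir = "data/dtd"                     # assumed path
months = 12                              # assumed: how many months back to fetch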
def get_livedata_list(stock_id_list):
    delay = 4
    max_try = 3
    while max_try > 0:
        tools.delay(delay)
        try:
            logger.logp("connecting to livedata...")
            # Visit the main page first to obtain a session cookie.
            url = "http://163.29.17.179/stock/fibest.jsp"
            cookie = http.cookiejar.CookieJar()
            handler = urllib.request.HTTPCookieProcessor(cookie)
            opener = urllib.request.build_opener(handler)
            logger.logp("opening url fibest...")
            opener.open(url)
            logger.logp("url fibest opened.")
            stock_arg = ""
            for stock_id in stock_id_list:
                stock_arg += "tse_{}.tw|".format(stock_id)
            arg = "getStockInfo.jsp?ex_ch={}&json=1&delay=0&_={}".format(
                stock_arg, int(time.time() * 1000))
            url = "http://163.29.17.179/stock/api/" + arg
            request = urllib.request.Request(url)
            logger.logp("opening url getStockInfo...")
            res = opener.open(request, timeout=10)
            logger.logp("url getStockInfo opened.")
        except Exception:
            logger.logp("Error: connection")
            max_try -= 1
            continue
        try:
            livedata_list = json.loads(res.read().decode())
            if livedata_list["rtmessage"] != "OK":
                logger.logp("Error: data")
                max_try -= 1
                continue
        except Exception:
            logger.logp("Error: json")
            max_try -= 1
            continue
        return livedata_list["msgArray"]
    tools.delay(30)
    return None
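# Hypothetical usage of get_livedata_list(); the ids are example inputs,
# and the "c"/"z" field names follow the common TWSE MIS JSON convention
# (stock code / last price), an assumption rather than something taken
# from this codebase.
def _demo_get_livedata_list():
    msg_array = get_livedata_list(["2330", "2317"])
    if msg_array is None:
        print("live data unavailable after retries")
        return
    for entry in msg_array:
        print(entry.get("c"), entry.get("z"))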
def update_smd_in_list(stock_id_list, smd_dir, months, force_update=False):
    update_log_path = smd_dir + "/update.log"
    if not is_smd_need_update(update_log_path) and not force_update:
        return
    communicate.master_start()
    # Split the work evenly between this process (the master) and the
    # connected slaves: the first sublist stays on the master, every
    # other sublist gets its own worker thread bound to a slave id.
    split_num = len(communicate.slaves) + 1
    total = len(stock_id_list)
    subtotal = int(total / split_num) + 1
    t_list = []
    stock_id_sublist_master = None
    slave_id = -1
    for i in range(0, total, subtotal):
        if i == 0:
            stock_id_sublist_master = stock_id_list[i:i + subtotal]
            continue
        stock_id_sublist = stock_id_list[i:i + subtotal]
        slave_en = True
        slave_id += 1
        t = threading.Thread(target=t_update_smd_in_list,
                             args=(stock_id_sublist, smd_dir, months,
                                   slave_en, slave_id))
        t_list.append(t)
        t.start()
    t_update_smd_in_list(stock_id_sublist_master, smd_dir, months,
                         slave_en=False)
    for t in t_list:
        while t.is_alive():
            tools.delay(10)
    # Record the update hour so later calls can skip redundant work.
    with open(update_log_path, 'w') as update_log_file:
        update_log_file.write(datetime.datetime.now().strftime("%Y/%m/%d/%H"))
def add_ms_ac_info(input_file, output_file):
    papers = load_json_lines(input_file)
    papers = log_stream(papers, name='Input')
    papers_parsed = ({
        'ms_academic': get_mc_ac_paper(
            expr="and(Ti='" + normalize_title(p['title']) + "',Y>=2014)"),
        **p
    } for p in delay(papers, 2))
    papers_parsed_printed = log_stream(papers_parsed, name='Output')
    write_json_lines(papers_parsed_printed, output_file)
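# Hypothetical usage of add_ms_ac_info(); the file names are examples.
# Because every stage above is a generator, records stream through one
# at a time, so memory use stays flat even for large input files.
if __name__ == "__main__":
    add_ms_ac_info("papers.jsonl", "papers_with_ms_academic.jsonl")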
def debug_loop():
    while True:
        tools.delay(5)
        if is_ready:
            print(livedata_dict)
def get_full_data(stock_id, yyyymmdd):
    delay = 0.3
    max_try = 5
    while max_try > 0:
        tools.delay(delay)
        logger.logp("connect {} {}".format(stock_id, yyyymmdd))
        try:
            args = "?action=r&id={}&date={}".format(stock_id, yyyymmdd)
            url = "http://www.cmoney.tw/notice/chart/stockchart.aspx" + args
            cookie = http.cookiejar.CookieJar()
            handler = urllib.request.HTTPCookieProcessor(cookie)
            opener = urllib.request.build_opener(handler)
            res = opener.open(url, timeout=10).read().decode()
        except Exception:
            logger.logp("Error: connection")
            max_try -= 1
            tools.delay(3)
            continue
        # Extract the "ck" token embedded in the page source; the data
        # service rejects requests without it.
        try:
            i = res.find("var ck")
            s = res.find('"', i) + 1
            e = res.find('"', s)
            ck = res[s:e]
        except Exception:
            logger.logp("Error: parse ck")
            max_try -= 1
            tools.delay(3)
            continue
        try:
            args += "&ck=" + ck
            url2 = ("http://www.cmoney.tw/notice/chart/stock-chart-service.ashx"
                    + args)
            request = urllib.request.Request(url2)
            request.add_header("Referer", url)
            res = opener.open(request, timeout=10)
        except Exception:
            logger.logp("Error: connection")
            max_try -= 1
            tools.delay(3)
            continue
        try:
            content = json.loads(res.read().decode())
            if content["ErrorCode"] == 0:
                return content
            # ErrorCode 124554 is treated as an empty result rather
            # than a retryable failure.
            if content["ErrorCode"] == 124554:
                return {}
            logger.logp("ErrorCode: {}".format(content["ErrorCode"]))
            max_try -= 1
            tools.delay(3)
            continue
        except Exception:
            logger.logp("Error: json")
            max_try -= 1
            tools.delay(3)
            continue
    return None
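# Hypothetical usage of get_full_data(); the id and date are example
# inputs. An empty dict marks a day the service reports as having no
# data, while None means every retry failed.
def _demo_get_full_data():
    content = get_full_data("2330", 20170512)
    if content is None:
        print("request failed")
    elif not content:
        print("no data for that day")
    else:
        print(sorted(content.keys()))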
def get_day_trading_data(yyyymmdd):
    logger.logp("get_day_trading_data: {}".format(yyyymmdd))
    url = ("http://www.twse.com.tw/exchangeReport/TWTB4U"
           "?response=json&date={}&selectType=All".format(yyyymmdd))
    tools.delay(3)  # throttle before hitting TWSE
    max_try = 3
    from socket import timeout
    while True:
        logger.logp("Trying connection...")
        try:
            res = urllib.request.urlopen(url, timeout=10)
            logger.logp("OK")
        except timeout:
            logger.logp("Error: urllib -- timeout")
            tools.wait_retry(logger, 10)
            continue
        except Exception:
            logger.logp("Error: urllib")
            tools.wait_retry(logger, 30)
            continue
        logger.logp("Trying json decode...")
        # check stat
        try:
            data = json.loads(res.read().decode())
            if data["stat"] != "OK":
                logger.logp("data error: stat = {}".format(data["stat"]))
                tools.wait_retry(logger, 5)
                if max_try == 0:
                    return None
                max_try -= 1
                continue
        except Exception:
            logger.logp("Error: json when checking stat")
            tools.wait_retry(logger, 5)
            continue
        # check that the response is for the requested date
        try:
            if data["date"] != "{}".format(yyyymmdd):
                logger.logp("data error: date = {}".format(data["date"]))
                tools.wait_retry(logger, 5)
                if max_try == 0:
                    return None
                max_try -= 1
                continue
        except Exception:
            logger.logp("Error: json when checking date")
            tools.wait_retry(logger, 5)
            continue
        return data["data"]
def get_listed_list():
    # Scrape the TWSE ISIN page of listed common stocks and return a
    # "stock_id,ipo_date;stock_id,ipo_date;..." string.
    url = "http://isin.twse.com.tw/isin/class_main.jsp?market=1&issuetype=1"
    max_try = 3
    while True:
        if max_try == 0:
            return None
        if max_try != 3:
            tools.delay(5)
        max_try -= 1
        try:
            res = urllib.request.urlopen(url)
        except Exception:
            logger.logp("Error: get listed id -- urllib")
            continue
        content = res.read().decode("cp950", errors='ignore')
        # The stock rows live in the second <table> of the page source.
        i_end = content.find("</table>")
        if i_end < 0:
            logger.logp("Error: get listed id -- source")
            continue
        i = i_end + 10
        i_end = content.find("</table>", i)
        if i_end < 0:
            logger.logp("Error: get listed id -- source")
            continue
        op_str = ""
        is_first_data = True
        while i < i_end:
            stock_id_str = ""
            ipo_date = ""
            i = content.find("<tr>", i)
            if i < 0:
                break
            # The stock id sits in the 3rd <td> of each row; collect its
            # text, skipping spaces and newlines.
            for j in range(3):
                i = content.find("<td", i + 5)
                i = content.find('>', i + 5)
            i += 1
            while content[i] != '<':
                if content[i] != ' ' and content[i] != '\n':
                    stock_id_str += content[i]
                i += 1
            # The IPO date sits 5 <td> cells further on.
            for j in range(5):
                i = content.find("<td", i + 5)
                i = content.find('>', i + 5)
            i += 1
            while content[i] != '<':
                if content[i] != ' ' and content[i] != '\n':
                    ipo_date += content[i]
                i += 1
            op = "{},{}".format(stock_id_str, ipo_date)
            if is_first_data:
                is_first_data = False
            else:
                op = ';' + op
            op_str += op
        return op_str
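# Hypothetical consumer of get_listed_list(): the function returns one
# "stock_id,ipo_date;stock_id,ipo_date;..." string, which splits back
# into pairs like this.
def _demo_get_listed_list():
    raw = get_listed_list()
    if not raw:
        print("listed list unavailable")
        return
    for entry in raw.split(';'):
        stock_id, ipo_date = entry.split(',')
        print(stock_id, ipo_date)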