def return_stock_in_bankuai(self, bankuai):
    """Return the stocks listed under a bankuai (board/sector) path.

    Parameters:
        bankuai -- list path from the top bankuai down to the bottom one,
                   e.g. ['板块', '概念板块', 'AB股票'].

    Returns:
        The input path with the stock list appended:
        ['板块', '概念板块', 'AB股票', [[code, name], [code, name], ...]]

    Raises:
        RuntimeError -- if the bottom bankuai is not present in the cached tree.
    """
    bankuai_tree = self.__bankuai_tree
    # Validate the path against the cached tree before hitting the network.
    if not bankuai[2] in bankuai_tree[bankuai[0]]["children"][bankuai[1]]["children"]:
        # BUGFIX: use call-style raise; the old `raise RuntimeError, (...)` form
        # is a syntax error on Python 3 and is equivalent to this on Python 2.
        raise RuntimeError("The url of [" + ",".join(bankuai) + "] is not correct.", "in Eastmoney.py")
    bankuai_detail_url = self.return_url_for_bankuai_stock(bankuai)
    while True:  # infinite loop until the stock page downloads successfully
        try:
            bankuai_detail_page = read_url(bankuai_detail_url)
            break
        except Exception:  # narrowed from bare except: don't trap SystemExit/KeyboardInterrupt
            warn_log('Connection lost, retry in 10 seconds ...')
            time.sleep(10)
    # The page embeds the stock list as a JS array literal: [...]
    r_return_code_detail_grp = r'\[(?P<code_detail_grp>.*)\]'
    code_detail_grp = re.search(r_return_code_detail_grp, bankuai_detail_page).group("code_detail_grp")
    # Each entry is a double-quoted, comma-separated record.
    r_return_code_detail = r'"(?P<code_detail>[^"]*)"'
    r_code_detail = re.compile(r_return_code_detail)
    stocks = []
    for m in r_code_detail.finditer(code_detail_grp):
        match_group_into_list = m.group("code_detail").split(",")
        # Field 1 = stock code, field 2 = stock name (decoded to unicode here).
        stocks.append([match_group_into_list[1], match_group_into_list[2].decode("utf-8")])
    out_list = copy.copy(bankuai)  # shallow copy so the caller's list is not mutated
    out_list.append(stocks)
    return out_list
def download_to_file(stocks, stock_obj_name, start_date, end_date, to_file, log_fh, warn_fh):
    """Iterate over stocks, download EOD data via the named stock class, append to a file.

    Parameters:
        stocks         -- iterable of stock codes
        stock_obj_name -- class name evaluated per stock (dates only used for 'Yahoo_stock')
        start_date     -- start date string, passed through to Yahoo_stock only
        end_date       -- end date string, passed through to Yahoo_stock only
        to_file        -- path appended to with one line per fetched content block
        log_fh         -- file handle for normal log output
        warn_fh        -- file handle for warning output
    """
    #-- iterate stocks, download eod data from webside
    num = 0
    # `with` guarantees the output file is closed even if a download raises.
    with open(to_file, 'a') as fh:
        for s in stocks:
            #-- call method of stock object to get content of url
            try:
                new_class = '%(object)s("%(stock)s", "%(start_date)s", "%(end_date)s")' % {
                    'object': stock_obj_name,
                    'stock': s,
                    'start_date': start_date if stock_obj_name == 'Yahoo_stock' else 'dummy',
                    'end_date': end_date if stock_obj_name == 'Yahoo_stock' else 'dummy'
                }
                print_log(new_class)
                while True:  # infinite loop until stock download completes successfully
                    try:
                        # NOTE(review): eval() on constructed source; safe only while
                        # stock_obj_name comes from trusted config, never user input.
                        obj = eval(new_class)
                        for k, v in obj.get_stock_content().items():
                            print_log('%(num)s - Writing %(code)s ...' % {'num': num, 'code': k}, log_fh)
                            if re.match(r'pv_none_match', v) or re.match(r'.+"";$', v):  # match empty from tengxun and sina
                                warn_log('No content fetched for ' + k, warn_fh)
                            else:
                                fh.write(v + '\n')
                                num += 1
                        break
                    except (KeyError, HTTPError):
                        # BUGFIX: the former bare `except:` also swallowed these, so
                        # the outer handlers below were unreachable and a bad stock
                        # retried forever. Re-raise so each is logged and skipped once.
                        raise
                    except Exception:
                        warn_log('Connection lost, retry in 10 seconds ...')
                        time.sleep(10)
            except KeyError:
                warn_log(s[0:2] + ' is not setup in ' + stock_obj_name, warn_fh)
                continue
            except HTTPError:
                # log and skip for stocks couldn't be returned from yahoo interface
                warn_log('Get content failed when ' + new_class, warn_fh)
                continue
    print_log('{num} stocks have been written into {file}.'.format(num=num, file=to_file), log_fh)
def return_bankuai_in_bankuai(self, bankuai, sort_direction="desc"):
    """Return the sub-bankuais (boards) listed under a bankuai path.

    Parameters:
        bankuai        -- list path from the top bankuai down, e.g. ['板块', '概念板块'].
        sort_direction -- 'desc' or 'asc'; selects which pre-sorted group the page
                          provides (case-insensitive).

    Returns:
        The input path with the board list appended:
        ['板块', '概念板块', [
            [bankuai_name, increase, amount(in 0.1 billion), change_ratio,
             rising_count, falling_count, leading_stock_code, leading_stock_name, increase],
            ...
        ]]

    Raises:
        RuntimeError -- if sort_direction is neither 'desc' nor 'asc'.
    """
    if not sort_direction.lower() in ["desc", "asc"]:
        # BUGFIX: use call-style raise; the old `raise RuntimeError, (...)` form
        # is a syntax error on Python 3 and is equivalent to this on Python 2.
        raise RuntimeError("Incorrect parameter [%(direction)s]" % {"direction": sort_direction}, "in Eastmoney.py")
    bankuai_url = self.return_url_for_bankuai_bankuai(bankuai)
    while True:  # infinite loop until the board page downloads successfully
        try:
            bankuai_page = read_url(bankuai_url)
            break
        except Exception:  # narrowed from bare except: don't trap SystemExit/KeyboardInterrupt
            warn_log('Connection lost, retry in 10 seconds ...')
            time.sleep(10)
    # The page embeds two pre-sorted JS arrays: [[...desc...],[...asc...]]
    r_return_bankuai_detail_grp = r'\[\[(?P<bankuai_detail_group_desc>[^\]]+)\],\[(?P<bankuai_detail_group_asc>[^\]]+)\]\]'
    match_objs = re.search(r_return_bankuai_detail_grp, bankuai_page)
    bankuai_detail_grp = match_objs.group("bankuai_detail_group_" + sort_direction)
    # Each entry is a double-quoted, comma-separated record of 9 fields.
    r_return_code_detail = r'"(?P<code_detail>[^"]*)"'
    r_code_detail = re.compile(r_return_code_detail)
    bankuais = []
    for m in r_code_detail.finditer(bankuai_detail_grp):
        match_group_into_list = m.group("code_detail").split(",")
        # Slice replaces the former 9 explicit indexes [0]..[8]; same result.
        bankuais.append(match_group_into_list[0:9])
    out_list = copy.copy(bankuai)  # shallow copy so the caller's list is not mutated
    out_list.append(bankuais)
    return out_list
def download_to_file(stocks, stock_obj_name, start_date, end_date, to_file, log_fh, warn_fh):
    """Iterate over stocks, download EOD data via the named stock class, append to a file.

    Parameters:
        stocks         -- iterable of stock codes
        stock_obj_name -- class name evaluated per stock (dates only used for 'Yahoo_stock')
        start_date     -- start date string, passed through to Yahoo_stock only
        end_date       -- end date string, passed through to Yahoo_stock only
        to_file        -- path appended to with one line per fetched content block
        log_fh         -- file handle for normal log output
        warn_fh        -- file handle for warning output
    """
    # -- iterate stocks, download eod data from webside
    num = 0
    # `with` guarantees the output file is closed even if a download raises.
    with open(to_file, "a") as fh:
        for s in stocks:
            # -- call method of stock object to get content of url
            try:
                new_class = '%(object)s("%(stock)s", "%(start_date)s", "%(end_date)s")' % {
                    "object": stock_obj_name,
                    "stock": s,
                    "start_date": start_date if stock_obj_name == "Yahoo_stock" else "dummy",
                    "end_date": end_date if stock_obj_name == "Yahoo_stock" else "dummy",
                }
                print_log(new_class)
                while True:  # infinite loop until stock download completes successfully
                    try:
                        # NOTE(review): eval() on constructed source; safe only while
                        # stock_obj_name comes from trusted config, never user input.
                        obj = eval(new_class)
                        for k, v in obj.get_stock_content().items():
                            print_log("Writing %(code)s ..." % {"code": k}, log_fh)
                            if re.match(r"pv_none_match", v) or re.match(r'.+"";$', v):  # match empty from tengxun and sina
                                warn_log("No content fetched for " + k, warn_fh)
                            else:
                                fh.write(v + "\n")
                                num += 1
                        break
                    except (KeyError, HTTPError):
                        # BUGFIX: the former bare `except:` also swallowed these, so
                        # the outer handlers below were unreachable and a bad stock
                        # retried forever. Re-raise so each is logged and skipped once.
                        raise
                    except Exception:
                        warn_log("Connection lost, retry in 10 seconds ...")
                        time.sleep(10)
            except KeyError:
                warn_log(s[0:2] + " is not setup in " + stock_obj_name, warn_fh)
                continue
            except HTTPError:
                # log and skip for stocks couldn't be returned from yahoo interface
                warn_log("Get content failed when " + new_class, warn_fh)
                continue
    print_log("{num} stocks have been written into {file}.".format(num=num, file=to_file), log_fh)
def loader(queue, conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id, merge_before_copy=options.merge_before_copy, enable_copy=options.enable_copy):
    """Load downloaded stock-transaction files into dw.stock_transaction, one business date at a time.

    Parameters:
        queue             -- worker queue used to throttle Stock_trans_loader workers
        conn              -- open DB connection; committed after each date's work
        start_date        -- 'YYYYMMDD' string, first date to load (inclusive)
        end_date          -- 'YYYYMMDD' string, last date to load (inclusive)
        stock_id          -- optional single-stock filter for the per-stock path
        merge_before_copy -- True: merge all of a date's files into one CSV and bulk-copy it;
                             False: spawn a Stock_trans_loader per pending stock row
        enable_copy       -- passed through to Stock_trans_loader
    """
    cur_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')
    # Select rows already downloaded but not yet loaded for one business date.
    stock_list_sql = ''' select row_id, biz_date, stock_id from dw.log_stock_transaction where biz_date = '{biz_date}' and is_download_success = 'Y' and (is_load_success = 'N' or is_load_success is null) '''
    if not stock_id is None:
        # NOTE(review): stock_id is concatenated into the SQL unescaped — acceptable
        # for trusted CLI input, but would allow SQL injection otherwise; confirm source.
        stock_list_sql = stock_list_sql + ' and stock_id = \'' + stock_id + '\''
    cur = get_cur(conn)
    while cur_date_dt <= end_date_dt:
        if merge_before_copy:
            # Loading files one by one takes too long; merging all pieces into a single
            # file and bulk-loading it completes in under 5 minutes.
            cur_date_str = cur_date_dt.strftime('%Y%m%d')
            working_dir = data_dir + SEP + cur_date_str
            file_merged = os.path.join(working_dir, "file_merged.csv")
            if os.path.exists(file_merged):
                # Remove stale output from a previous run; we open in append mode below.
                warn_log('Removing old file: ' + file_merged)
                os.remove(file_merged)
            #-- Starting to merge files
            with open(file_merged, "a") as dest:
                i = 0
                for _, _, filenames in os.walk(working_dir):
                    for filename in fnmatch.filter(filenames, "[0-9]*.txt"):
                        with open(os.path.join(working_dir, filename)) as src:
                            shutil.copyfileobj(src, dest)
                        i += 1
                print_log('Merged ' + str(i) + ' files.')
            #-- Deleting records from db (makes the load idempotent for this date)
            del_sql = '''delete from dw.stock_transaction where biz_date = '{}' '''.format(cur_date_str)
            get_query_result(conn, del_sql)
            conn.commit()
            print_log('Deletion for biz_date {} completed successfully.'.format(cur_date_str))
            #-- Updating is_load_success to N in log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'N' where biz_date = '{}' and is_download_success = 'Y' '''.format(cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to N')
            #++++++++ Starting to load the merged file into table
            psql_copy_from(DB_HOST, DB_NAME, DB_UNAME, 'dw.stock_transaction', file_merged, DB_PORT, args=' with (encoding \'GBK\')')
            print_log('Successfully loaded {} into table.'.format(file_merged))
            #-- Updating is_load_success to Y in log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'Y' where biz_date = '{}' and is_download_success = 'Y' '''.format(cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to Y')
            #-- Cleaning up working dir
            os.remove(file_merged)
            cur_date_dt = cur_date_dt + datetime.timedelta(1)
        else:
            stock_list_sql_var_replaced = stock_list_sql.format(biz_date=cur_date_dt)
            cur.execute(stock_list_sql_var_replaced)
            rows = list(cur)
            for row in rows:
                row_id = row['row_id']
                biz_date = str(row['biz_date']).replace('-', '')
                stock_id = row['stock_id']  # NOTE(review): shadows the stock_id parameter from here on
                while queue.full():
                    # Throttle: wait for a free worker slot before spawning another loader.
                    print_log('=================> queue is full, wait for 1 second...')
                    time.sleep(1)
                s = Stock_trans_loader(queue, conn, row_id, stock_id, biz_date, enable_copy=enable_copy)
                s.start()
                print_log('-----> queue size: ' + str(queue.qsize()))
            conn.commit()
            cur_date_dt = cur_date_dt + datetime.timedelta(1)
    # Drain: block until all spawned loaders have finished before returning.
    while not queue.empty():
        print_log('=================> queue is not empty yet, wait for 1 second...')
        time.sleep(1)
# NOTE(review): dangling `else:` — the matching `if`/`elif` chain (dispatch on
# options.table) lives above this chunk and is not visible here.
else:
    exit_error("table is not correct! [" + options.table + "]")
#-- replace $DATE: checks file existence; a file that doesn't exist is not added to the loading list
start_dt_dt = datetime.datetime.strptime(start_date, "%Y%m%d")
end_dt_dt = datetime.datetime.strptime(end_date, "%Y%m%d")
for k,v in files_to_load.items():
    dt_replaced = []
    process_dt_dt = start_dt_dt
    while process_dt_dt <= end_dt_dt:
        process_dt = datetime.datetime.strftime(process_dt_dt, "%Y%m%d")
        if os.path.isfile(v[0].replace("$DATE", process_dt)):
            dt_replaced.append(v[0].replace("$DATE", process_dt))
        else:
            warn_log(v[0].replace("$DATE", process_dt) + " doesn't exist." )
        process_dt_dt = process_dt_dt + datetime.timedelta(1)
    # Replace the $DATE template entry with the list of concrete, existing files.
    files_to_load[k] = dt_replaced
#-- Start to load, honouring the table order in load_seq_tables
for t in load_seq_tables:
    if t in files_to_load:
        for f in files_to_load[t]:
            cmd = "%(func_name)s(%(param)s)" % {"func_name": table_mapping[t]["func_name"], "param": table_mapping[t]["param"]}
            # replace \\ with \\\\ is just for windows platform, unix/linux platform won't be impacted
            cmd_with_filename = cmd.replace("$f", f.replace('\\', '\\\\'))
            # NOTE(review): eval() runs a call string built from table_mapping config —
            # safe only while that mapping is trusted, never for untrusted input.
            eval(cmd_with_filename)
conn.close()
# NOTE(review): dangling `else:` — the matching `if`/`elif` chain (dispatch on
# options.table) lives above this chunk and is not visible here.
else:
    exit_error("table is not correct! [" + options.table + "]")
# -- replace $DATE: checks file existence; a file that doesn't exist is not added to the loading list
start_dt_dt = datetime.datetime.strptime(start_date, "%Y%m%d")
end_dt_dt = datetime.datetime.strptime(end_date, "%Y%m%d")
for k, v in files_to_load.items():
    dt_replaced = []
    process_dt_dt = start_dt_dt
    while process_dt_dt <= end_dt_dt:
        process_dt = datetime.datetime.strftime(process_dt_dt, "%Y%m%d")
        if os.path.isfile(v[0].replace("$DATE", process_dt)):
            dt_replaced.append(v[0].replace("$DATE", process_dt))
        else:
            warn_log(v[0].replace("$DATE", process_dt) + " doesn't exist.")
        process_dt_dt = process_dt_dt + datetime.timedelta(1)
    # Replace the $DATE template entry with the list of concrete, existing files.
    files_to_load[k] = dt_replaced
# -- Start to load, honouring the table order in load_seq_tables
for t in load_seq_tables:
    if t in files_to_load:
        for f in files_to_load[t]:
            cmd = "%(func_name)s(%(param)s)" % {
                "func_name": table_mapping[t]["func_name"],
                "param": table_mapping[t]["param"],
            }
            cmd_with_filename = cmd.replace(
                "$f", f.replace("\\", "\\\\")
            )  # replace \\ with \\\\ is just for windows platform, unix/linux platform won't be impacted
            # NOTE(review): eval() runs a call string built from table_mapping config —
            # safe only while that mapping is trusted, never for untrusted input.
            eval(cmd_with_filename)
def loader(queue, conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id, merge_before_copy=options.merge_before_copy, enable_copy=options.enable_copy):
    """Load downloaded stock-transaction files into dw.stock_transaction per business date.

    Parameters:
        queue             -- worker queue throttling Stock_trans_loader workers
        conn              -- open DB connection; committed after each date's work
        start_date        -- 'YYYYMMDD' string, first date to load (inclusive)
        end_date          -- 'YYYYMMDD' string, last date to load (inclusive)
        stock_id          -- optional single-stock filter for the per-stock path
        merge_before_copy -- True: merge a date's files into one CSV and bulk-copy it;
                             False: spawn a Stock_trans_loader per pending stock row
        enable_copy       -- passed through to Stock_trans_loader
    """
    cur_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')
    # Rows already downloaded but not yet loaded for a single business date.
    stock_list_sql = ''' select row_id, biz_date, stock_id from dw.log_stock_transaction where biz_date = '{biz_date}' and is_download_success = 'Y' and (is_load_success = 'N' or is_load_success is null) '''
    if not stock_id is None:
        # NOTE(review): stock_id is concatenated into the SQL unescaped — acceptable
        # for trusted CLI input, but would allow SQL injection otherwise; confirm source.
        stock_list_sql = stock_list_sql + ' and stock_id = \'' + stock_id + '\''
    cur = get_cur(conn)
    while cur_date_dt <= end_date_dt:
        if merge_before_copy:
            # Loading files one by one takes too long; merging all pieces into one
            # file and bulk-loading completes in under 5 minutes.
            cur_date_str = cur_date_dt.strftime('%Y%m%d')
            working_dir = data_dir + SEP + cur_date_str
            file_merged = os.path.join(working_dir, "file_merged.csv")
            if os.path.exists(file_merged):
                # Drop stale output from a previous run; the merge opens in append mode.
                warn_log('Removing old file: ' + file_merged)
                os.remove(file_merged)
            #-- Starting to merge files
            with open(file_merged, "a") as dest:
                i = 0
                for _, _, filenames in os.walk(working_dir):
                    for filename in fnmatch.filter(filenames, "[0-9]*.txt"):
                        with open(os.path.join(working_dir, filename)) as src:
                            shutil.copyfileobj(src, dest)
                        i += 1
                print_log('Merged ' + str(i) + ' files.')
            #-- Deleting records from db (makes the load idempotent for this date)
            del_sql = '''delete from dw.stock_transaction where biz_date = '{}' '''.format(
                cur_date_str)
            get_query_result(conn, del_sql)
            conn.commit()
            print_log(
                'Deletion for biz_date {} completed successfully.'.format(
                    cur_date_str))
            #-- Updating is_load_success to N in log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'N' where biz_date = '{}' and is_download_success = 'Y' '''.format(
                cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to N')
            #++++++++ Starting to load the merged file into table
            psql_copy_from(DB_HOST, DB_NAME, DB_UNAME, 'dw.stock_transaction', file_merged, DB_PORT, args=' with (encoding \'GBK\')')
            print_log('Successfully loaded {} into table.'.format(file_merged))
            #-- Updating is_load_success to Y in log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'Y' where biz_date = '{}' and is_download_success = 'Y' '''.format(
                cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to Y')
            #-- Cleaning up working dir
            os.remove(file_merged)
            cur_date_dt = cur_date_dt + datetime.timedelta(1)
        else:
            stock_list_sql_var_replaced = stock_list_sql.format(
                biz_date=cur_date_dt)
            cur.execute(stock_list_sql_var_replaced)
            rows = list(cur)
            for row in rows:
                row_id = row['row_id']
                biz_date = str(row['biz_date']).replace('-', '')
                stock_id = row['stock_id']  # NOTE(review): shadows the stock_id parameter from here on
                while queue.full():
                    # Throttle: wait for a free worker slot before spawning another loader.
                    print_log(
                        '=================> queue is full, wait for 1 second...'
                    )
                    time.sleep(1)
                s = Stock_trans_loader(queue, conn, row_id, stock_id, biz_date, enable_copy=enable_copy)
                s.start()
                print_log('-----> queue size: ' + str(queue.qsize()))
            conn.commit()
            cur_date_dt = cur_date_dt + datetime.timedelta(1)
    # Drain: block until all spawned loaders have finished before returning.
    while not queue.empty():
        print_log(
            '=================> queue is not empty yet, wait for 1 second...')
        time.sleep(1)