def download_log_checker(conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id):
    start_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')
    # pick the latest download attempt per (biz_date, stock_id) and report the ones with is_download_success = 'N'
    chk_sql = '''select t.biz_date, t.stock_id
                 from (select biz_date, stock_id, is_download_success,
                              row_number() over(partition by biz_date, stock_id order by download_end_time desc nulls last) rankid
                       from dw.log_stock_transaction
                       where biz_date between '{start_date}' and '{end_date}') t
                 where t.rankid = 1 and t.is_download_success = 'N'
              '''.format(start_date=start_date_dt, end_date=end_date_dt)
    if stock_id is not None:
        chk_sql = chk_sql + " and t.stock_id = '" + stock_id + "'"
    cur = get_cur(conn)
    cur.execute(chk_sql)
    rows = list(cur)
    if len(rows) == 0:
        print_log('All the stocks have been downloaded successfully.')
    else:
        for row in rows:
            error_log(str(row['biz_date']) + ':' + row['stock_id'] + ' failed to download.')
    return len(rows)
def get_query_result(conn, query):
    cur = get_cur(conn)
    cur.execute(query)
    if re.match(r'^(insert|update|delete)', query.strip(), re.IGNORECASE):
        return 'Command executed successfully.'
    else:
        return list(cur)
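# A minimal usage sketch for get_query_result (hypothetical: assumes `conn` is an open
# DB-API connection and get_cur returns a dict-style cursor, as elsewhere in this module).
# Note that DML fired through get_query_result is not committed here, so the caller
# still has to call conn.commit():
#
#     rows = get_query_result(conn, "select id, name from dw.dim_stock limit 10")
#     for row in rows:
#         print_log(row['id'])
#
#     msg = get_query_result(conn, "delete from dw.bankuai where biz_date = '2015-06-05'")
#     conn.commit()  # get_query_result does not commit DML itself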
def recent_working_day(in_date='today', is_skip_holiday=False, conn=None):
    # in_date=yyyymmdd
    # if is_skip_holiday=False, return the most recent non-weekend day
    # if is_skip_holiday=True, return the most recent non-weekend day, skipping holidays as well
    holidays = []
    if re.match(r"^\d{8}$", in_date):
        date_date = datetime.datetime.strptime(in_date, '%Y%m%d')
    else:
        date_date = get_date(in_date, to_date=True)
    if is_skip_holiday:
        if conn is None:
            raise RuntimeError('connection must be available when skip_holiday mode is on.')
        else:
            cur = get_cur(conn)
            cur.execute('select date from dw.holiday')  # yyyymmdd
            rows = list(cur)
            for row in rows:
                holidays.append(row['date'])
            cur.close()
    while date_date.isoweekday() >= 6 or date_date.strftime('%Y%m%d') in holidays:
        date_date = date_date + datetime.timedelta(-1)
    return date_date.strftime('%Y%m%d')
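# Usage sketch for recent_working_day (dates are illustrative; 2015-06-06 was a Saturday):
#
#     recent_working_day('20150606')   # -> '20150605', Saturday rolls back to Friday
#     recent_working_day('20150605')   # -> '20150605', already a weekday
#     # skipping holidays as well requires a live connection to read dw.holiday:
#     recent_working_day('20150606', is_skip_holiday=True, conn=conn)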
def get_row_id(self):
    row_id_sql = "select nextval('dw.seq_log_stock_trans_row_id') as row_id"
    cur = get_cur(self.conn)
    cur.execute(row_id_sql)
    db_rows = list(cur)
    self.row_id = db_rows[0]['row_id']
    return self.row_id
def load_log_checker(conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id):
    start_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')
    # report stocks that downloaded fine but have not been loaded successfully
    chk_sql = '''select biz_date, stock_id
                 from dw.log_stock_transaction
                 where biz_date between '{start_date}' and '{end_date}'
                 and is_download_success = 'Y'
                 and (is_load_success = 'N' or is_load_success is null)
              '''.format(start_date=start_date_dt, end_date=end_date_dt)
    if stock_id is not None:
        chk_sql = chk_sql + " and stock_id = '" + stock_id + "'"
    cur = get_cur(conn)
    cur.execute(chk_sql)
    rows = list(cur)
    if len(rows) == 0:
        print_log('All the stocks have been loaded successfully.')
    else:
        for row in rows:
            error_log(str(row['biz_date']) + ':' + row['stock_id'] + ' failed to load.')
    return len(rows)
def update_log_table(self, is_success=True):
    upd_sql = '''update dw.log_stock_transaction
                 set download_end_time = '{end_time}', is_download_success = '{is_success}'
                 where row_id = {row_id}
              '''.format(row_id=self.row_id,
                         end_time=time.ctime(),
                         is_success='Y' if is_success else 'N')
    cur = get_cur(self.conn)
    cur.execute(upd_sql)
    self.conn.commit()
def return_parent_bankuai_ids(db_conn):
    query = "SELECT ID, NAME FROM DW.DIM_PARENT_BANKUAI"
    cur = get_cur(db_conn)
    cur.execute(query)
    rows = list(cur)
    return_dict = {}
    for row in rows:
        return_dict[row["name"].decode("utf-8")] = row["id"]
    cur.close()
    return return_dict
def get_stock_list(conn):
    # get stock list from db, excluding the '000000' placeholder id
    stocks = []
    sel_query = "select id from dw.dim_stock where id <> '000000'"
    cur = get_cur(conn)
    cur.execute(sel_query)
    rows = list(cur)
    for row in rows:
        stocks.append(row['id'])
    return stocks
def insert_log_table(self):
    ins_sql = '''insert into dw.log_stock_transaction
                 (row_id, biz_date, stock_id, download_start_time, download_source)
                 values ({row_id}, '{date}', '{stock}', '{start_time}', '{stock_trans_obj_name}')
              '''.format(row_id=self.row_id,
                         date=self.date,
                         stock=self.stock_id,
                         start_time=time.ctime(),
                         stock_trans_obj_name=self.stock_trans_obj_name)
    cur = get_cur(self.conn)
    cur.execute(ins_sql)
    self.conn.commit()
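# Taken together, get_row_id, insert_log_table and update_log_table form the logging
# lifecycle of a single download. A hypothetical driver (the downloader object carrying
# conn, date, stock_id and stock_trans_obj_name is assumed, not shown in this section):
#
#     downloader.get_row_id()          # reserve a row_id from dw.seq_log_stock_trans_row_id
#     downloader.insert_log_table()    # record the download start
#     try:
#         downloader.download()        # assumed download step
#         downloader.update_log_table(is_success=True)   # is_download_success = 'Y'
#     except Exception:
#         downloader.update_log_table(is_success=False)  # is_download_success = 'N'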
def get_stock_list(conn, biz_date, stock_id):
    # get the stocks still to download: all valid stocks minus those already downloaded for biz_date
    stocks = []
    if stock_id is not None:
        sel_query = '''select id from dw.dim_stock
                       where id <> '000000' and is_valid = 'Y' and id = '{stock_id}'
                       except
                       select stock_id from dw.log_stock_transaction
                       where biz_date = '{biz_date}' and is_download_success = 'Y' and stock_id = '{stock_id}'
                    '''.format(stock_id=stock_id, biz_date=biz_date)
    else:
        sel_query = '''select id from dw.dim_stock
                       where id <> '000000' and is_valid = 'Y'
                       except
                       select stock_id from dw.log_stock_transaction
                       where biz_date = '{biz_date}' and is_download_success = 'Y'
                    '''.format(biz_date=biz_date)
    cur = get_cur(conn)
    cur.execute(sel_query)
    rows = list(cur)
    for row in rows:
        stocks.append(row['id'])
    return stocks
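# The `except` clause above is what makes reruns cheap: the result is the set difference
# between all valid stock ids and those already logged as downloaded for the date, so a
# restarted job only picks up the leftovers. A hypothetical call:
#
#     pending = get_stock_list(conn, '20150605', None)
#     print_log('%s stocks left to download.' % len(pending))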
def load_into_dim_stock_bankuai(db_conn, file):
    #-- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    bk_st_pairs = []
    bk_st_pairs_dict = {}
    bk_id_dict = {}
    codes_to_valid = []
    codes_to_invalid = []
    # 板块  子版块    板块名称  股票代码  股票名称
    # 板块  概念板块  送转预期  600587    新华医疗
    for row in csvr:
        bk_name = row[u'板块名称'.encode("gbk")].decode("gbk")
        st_id = row[u'股票代码'.encode("gbk")].decode("gbk")
        bk_st_pairs.append([bk_name, st_id])
    csvf.close()
    print_log("%(num)s records have been read from %(fname)s." % {"num": len(bk_st_pairs), "fname": file})

    #---- get bankuai_id from dim_bankuai
    select_sql = "select t.id, t.name from dw.dim_bankuai t"
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_name = db_row["name"].decode("utf-8")
        db_id = db_row["id"]
        bk_id_dict[db_name] = db_id

    #---- convert to dict keyed by "bankuai_id-stock_id" (the logical PK)
    for i in range(len(bk_st_pairs)):
        bk_st_pairs[i][0] = bk_id_dict[bk_st_pairs[i][0]]
        bk_st_pairs[i].append(str(bk_st_pairs[i][0]) + "-" + str(bk_st_pairs[i][1]))  # as PK
        bk_st_pairs_dict[bk_st_pairs[i][2]] = {"bk": bk_st_pairs[i][0], "st": bk_st_pairs[i][1]}

    #---- get bk_id, st_id from db, search for the combination in the csv dict
    select_sql = "select t.stock_id, t.bankuai_id, t.is_valid from dw.dim_stock_bankuai t"
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_bk_id = db_row["bankuai_id"]
        db_st_id = db_row["stock_id"]
        db_pk = str(db_bk_id) + "-" + db_st_id
        db_is_valid = db_row["is_valid"]
        if db_pk in bk_st_pairs_dict and db_is_valid == "Y":
            del bk_st_pairs_dict[db_pk]
        elif db_pk in bk_st_pairs_dict and db_is_valid == "N":
            codes_to_valid.append(" ( bankuai_id = " + str(db_bk_id) + " and stock_id = '" + str(db_st_id) + "' ) ")
            del bk_st_pairs_dict[db_pk]
        elif db_is_valid == "N":
            # not in the csv file and already invalid in db: do nothing
            pass
        else:
            # not in the csv, but valid in db: mark it invalid
            codes_to_invalid.append(" ( bankuai_id = " + str(db_bk_id) + " and stock_id = '" + str(db_st_id) + "' ) ")

    #---- mark is_valid=N
    if len(codes_to_invalid) > 0:
        codes_to_invalid_str = " or ".join(codes_to_invalid)
        print_log("There are %(num)s stock bankuai combinations to be marked invalid. %(combination)s" % {"num": len(codes_to_invalid), "combination": codes_to_invalid_str})
        upd_sql = "update dw.dim_stock_bankuai t set is_valid = 'N', upd_time = now() where %(combinations)s" % {"combinations": codes_to_invalid_str}
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stock bankuai combinations need to be marked invalid.")

    #---- mark is_valid=Y
    if len(codes_to_valid) > 0:
        codes_to_valid_str = " or ".join(codes_to_valid)
        print_log("There are %(num)s stock bankuai combinations to be marked valid. %(combination)s" % {"num": len(codes_to_valid), "combination": codes_to_valid_str})
        upd_sql = "update dw.dim_stock_bankuai t set is_valid = 'Y', upd_time = now() where %(combinations)s" % {"combinations": codes_to_valid_str}
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stock bankuai combinations need to be marked valid.")

    #---- insert new combinations into dim_stock_bankuai
    if len(bk_st_pairs_dict.keys()) > 0:
        values = []
        print_log("There are %(num)s stock bankuai combinations to be inserted." % {"num": len(bk_st_pairs_dict.keys())})
        for pk in bk_st_pairs_dict:
            print_log(pk)
            values.append("('%(stock_id)s', '%(bankuai_id)s', now(), 'Y')" % {"stock_id": bk_st_pairs_dict[pk]["st"], "bankuai_id": bk_st_pairs_dict[pk]["bk"]})
        values_str = ",".join(values)
        ins_sql = "insert into dw.dim_stock_bankuai(stock_id, bankuai_id, upd_time, is_valid) values %(values)s" % {"values": values_str}
        cur.execute(ins_sql)
        db_conn.commit()
    else:
        print_log("No new stock bankuai combination.")
    print_log("dw.dim_stock_bankuai has been refreshed successfully.")
    #-- building dict for csv
    # based on the list of recon_fields_in_file, read the corresponding fields in the csv and concatenate them together as a PK
    print_log("Start to read %(file)s..." % {"file": file_to_recon})
    for row in csvr:
        key = []
        for i in range(len(file_db_recon[type]["recon_fields_in_file"])):
            field = file_db_recon[type]["recon_fields_in_file"][i]
            key.append(row[field.encode("gbk")].decode("gbk"))
        csv_dict["-".join(key)] = ""
    print_log("%(num)s records loaded, dict for csv done." % {"num": len(csv_dict.keys())})
    csvf.close()

    #-- building dict for db
    # based on the list of recon_fields_in_db, read the corresponding fields in the db and concatenate them together as a PK
    print_log("Start to read db...")
    select_sql = file_db_recon[type]["sql"]
    cur = get_cur(conn)
    cur.execute(select_sql)
    db_rows = list(cur)
    for row in db_rows:
        key = []
        for i in range(len(file_db_recon[type]["recon_fields_in_db"])):
            field = file_db_recon[type]["recon_fields_in_db"][i]
            key.append(row[field].decode("utf-8"))
        dbsql_dict["-".join(key)] = ""
    print_log("%(num)s records loaded, dict for db done." % {"num": len(dbsql_dict.keys())})

    #------------------------------------------- RECONing
    print_log("Recon starting >>>")
    csv_dict_keys = csv_dict.keys()
    # iterate keys in the csv dict; if a key is found in the db dict, remove it from both dicts
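# The file_db_recon mapping this fragment iterates over is defined elsewhere; a
# hypothetical entry, inferred from the keys read above (recon_fields_in_file,
# recon_fields_in_db, sql), would look roughly like:
#
#     file_db_recon = {
#         "dim_stock": {
#             "recon_fields_in_file": [u"股票代码", u"股票名称"],  # columns in the GBK csv
#             "recon_fields_in_db": ["id", "name"],                # columns returned by the sql below
#             "sql": "select id, name from dw.dim_stock where is_valid = 'Y'",
#         },
#     }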
def loader(queue, conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id, merge_before_copy=options.merge_before_copy, enable_copy=options.enable_copy):
    cur_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')
    stock_list_sql = '''select row_id, biz_date, stock_id
                        from dw.log_stock_transaction
                        where biz_date = '{biz_date}'
                        and is_download_success = 'Y'
                        and (is_load_success = 'N' or is_load_success is null)
                     '''
    if stock_id is not None:
        stock_list_sql = stock_list_sql + " and stock_id = '" + stock_id + "'"
    cur = get_cur(conn)
    while cur_date_dt <= end_date_dt:
        if merge_before_copy:
            # Loading the files one by one takes too long; merging all the pieces into a
            # single file and loading that one file cuts the load to under 5 minutes.
            cur_date_str = cur_date_dt.strftime('%Y%m%d')
            working_dir = data_dir + SEP + cur_date_str
            file_merged = os.path.join(working_dir, "file_merged.csv")
            if os.path.exists(file_merged):
                warn_log('Removing old file: ' + file_merged)
                os.remove(file_merged)
            #-- merge files
            with open(file_merged, "a") as dest:
                i = 0
                for _, _, filenames in os.walk(working_dir):
                    for filename in fnmatch.filter(filenames, "[0-9]*.txt"):
                        with open(os.path.join(working_dir, filename)) as src:
                            shutil.copyfileobj(src, dest)
                        i += 1
            print_log('Merged ' + str(i) + ' files.')
            #-- delete existing records for the date from db
            del_sql = '''delete from dw.stock_transaction where biz_date = '{}' '''.format(cur_date_str)
            get_query_result(conn, del_sql)
            conn.commit()
            print_log('Deletion for biz_date {} completed successfully.'.format(cur_date_str))
            #-- mark is_load_success = 'N' in the log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'N' where biz_date = '{}' and is_download_success = 'Y' '''.format(cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to N')
            #-- load the merged file into the table
            psql_copy_from(DB_HOST, DB_NAME, DB_UNAME, 'dw.stock_transaction', file_merged, DB_PORT, args=" with (encoding 'GBK')")
            print_log('Successfully loaded {} into table.'.format(file_merged))
            #-- mark is_load_success = 'Y' in the log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'Y' where biz_date = '{}' and is_download_success = 'Y' '''.format(cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to Y')
            #-- clean up working dir
            os.remove(file_merged)
            cur_date_dt = cur_date_dt + datetime.timedelta(1)
        else:
            stock_list_sql_var_replaced = stock_list_sql.format(biz_date=cur_date_dt)
            cur.execute(stock_list_sql_var_replaced)
            rows = list(cur)
            for row in rows:
                row_id = row['row_id']
                biz_date = str(row['biz_date']).replace('-', '')
                stock_id = row['stock_id']
                while queue.full():
                    print_log('=================> queue is full, wait for 1 second...')
                    time.sleep(1)
                s = Stock_trans_loader(queue, conn, row_id, stock_id, biz_date, enable_copy=enable_copy)
                s.start()
                print_log('-----> queue size: ' + str(queue.qsize()))
            conn.commit()
            cur_date_dt = cur_date_dt + datetime.timedelta(1)
    while not queue.empty():
        print_log('=================> queue is not empty yet, wait for 1 second...')
        time.sleep(1)
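# How the queue gets wired up is outside this section; a minimal driver sketch, assuming
# a bounded standard-library queue and the module-level `options` used throughout
# (the maxsize of 20 and get_conn are illustrative assumptions, not the repo's actual setup):
#
#     import Queue  # Python 2 standard library
#     work_queue = Queue.Queue(maxsize=20)  # bounds the number of concurrent Stock_trans_loader threads
#     conn = get_conn()                     # assumed connection helper
#     loader(work_queue, conn)              # dates and stock filter default to the parsed options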
print_log("Start to read %(file)s..." % {"file": file_to_recon}) for row in csvr: key = [] for i in range(len(file_db_recon[type]["recon_fields_in_file"])): field = file_db_recon[type]["recon_fields_in_file"][i] key.append(row[field.encode("gbk")].decode("gbk")) csv_dict["-".join(key)] = "" print_log("%(num)s records loaded, dict for csv done." % {"num": len(csv_dict.keys())}) csvf.close() #-- building dict for db # based on the list of recon_fields_in_db, read the corresponding fields in db and concatenate them together as a PK print_log("Start to read db...") select_sql = file_db_recon[type]["sql"] cur = get_cur(conn) cur.execute(select_sql) db_rows = list(cur) for row in db_rows: key = [] for i in range(len(file_db_recon[type]["recon_fields_in_db"])): field = file_db_recon[type]["recon_fields_in_db"][i] key.append(row[field].decode("utf-8")) dbsql_dict["-".join(key)] = "" print_log("%(num)s records loaded, dict for db done." % {"num": len(csv_dict.keys())}) #------------------------------------------- RECONing print_log("Recon starting >>>") csv_dict_keys = csv_dict.keys() # iterate keys in csv dict, if it is found in db dict, remove it from both dict
def loader(queue, conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id, merge_before_copy=options.merge_before_copy, enable_copy=options.enable_copy): cur_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d') end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d') stock_list_sql = ''' select row_id, biz_date, stock_id from dw.log_stock_transaction where biz_date = '{biz_date}' and is_download_success = 'Y' and (is_load_success = 'N' or is_load_success is null) ''' if not stock_id is None: stock_list_sql = stock_list_sql + ' and stock_id = \'' + stock_id + '\'' cur = get_cur(conn) while cur_date_dt <= end_date_dt: if merge_before_copy: # since load files one by one into table is taking too much time, the solution to boost the procedure is to merge all the pieces of files into one file and load the merge file into table, this takes less than 5 mins to complete. cur_date_str = cur_date_dt.strftime('%Y%m%d') working_dir = data_dir + SEP + cur_date_str file_merged = os.path.join(working_dir, "file_merged.csv") if os.path.exists(file_merged): warn_log('Removing old file: ' + file_merged) os.remove(file_merged) #-- Starting to merge files with open(file_merged, "a") as dest: i = 0 for _, _, filenames in os.walk(working_dir): for filename in fnmatch.filter(filenames, "[0-9]*.txt"): with open(os.path.join(working_dir, filename)) as src: shutil.copyfileobj(src, dest) i += 1 print_log('Merged ' + str(i) + ' files.') #-- Deleting records from db del_sql = '''delete from dw.stock_transaction where biz_date = '{}' '''.format( cur_date_str) get_query_result(conn, del_sql) conn.commit() print_log( 'Deletion for biz_date {} completed successfully.'.format( cur_date_str)) #-- Updating is_load_success to N in log table upd_sql = '''update dw.log_stock_transaction set is_load_success = 'N' where biz_date = '{}' and is_download_success = 'Y' '''.format( cur_date_str) get_query_result(conn, upd_sql) conn.commit() print_log('is_load_success is updated to N') #++++++++ Starting to load the merged file into table psql_copy_from(DB_HOST, DB_NAME, DB_UNAME, 'dw.stock_transaction', file_merged, DB_PORT, args=' with (encoding \'GBK\')') print_log('Successfully loaded {} into table.'.format(file_merged)) #-- Updating is_load_success to Y in log table upd_sql = '''update dw.log_stock_transaction set is_load_success = 'Y' where biz_date = '{}' and is_download_success = 'Y' '''.format( cur_date_str) get_query_result(conn, upd_sql) conn.commit() print_log('is_load_success is updated to Y') #-- Cleaning up working dir os.remove(file_merged) cur_date_dt = cur_date_dt + datetime.timedelta(1) else: stock_list_sql_var_replaced = stock_list_sql.format( biz_date=cur_date_dt) cur.execute(stock_list_sql_var_replaced) rows = list(cur) for row in rows: row_id = row['row_id'] biz_date = str(row['biz_date']).replace('-', '') stock_id = row['stock_id'] while queue.full(): print_log( '=================> queue is full, wait for 1 second...' ) time.sleep(1) s = Stock_trans_loader(queue, conn, row_id, stock_id, biz_date, enable_copy=enable_copy) s.start() print_log('-----> queue size: ' + str(queue.qsize())) conn.commit() cur_date_dt = cur_date_dt + datetime.timedelta(1) while not queue.empty(): print_log( '=================> queue is not empty yet, wait for 1 second...') time.sleep(1)
def insert_into_table(db_field_yaml, stock_obj_name, in_file, conn, log_fh, warn_fh):
    # Based on the db/object field mapping and the db types defined in the yaml, generate
    # the delete and insert sql and fire them at the db. This function can serve any db
    # insert, as long as the yaml and the object are set up properly.
    # Yaml example:
    #   biz_date:
    #     type: date
    #     is_pk: Y
    #     stock_object:
    #       Tengxun_stock: date
    from object_impl.Sina_stock import Sina_stock
    from object_impl.Tengxun_stock import Tengxun_stock
    from object_impl.Yahoo_stock import Yahoo_stock

    db_field_mapping = get_yaml(db_field_yaml)
    tab_name = os.path.basename(db_field_yaml).replace('.yml', '')  # yml file name as table name
    tab_fields = []  # table field names
    tab_pk = []      # table pk
    tab_types = []   # table field types
    obj_attrs = []   # attribute names in the stock object
    for k, v in db_field_mapping.items():
        tab_type = v['type']
        obj_attr = v['stock_object'][stock_obj_name]
        if obj_attr is not None:  # if None|Null is set for a field in the yml, exclude the field from the insertion
            tab_fields.append(k)
            if v['is_pk'] == 'Y':
                tab_pk.append(k)  # pk, delete before insert
            tab_types.append(tab_type)
            obj_attrs.append(obj_attr)
    del_sql = 'delete from {tab_name} where 1=1 '.format(tab_name=tab_name)
    ins_sql = 'insert into {tab_name}({fields}) '.format(tab_name=tab_name, fields=','.join(tab_fields))

    # iterate over each row in the file and insert it into the table
    num = 0
    with open(in_file) as f:
        for row in f.readlines():
            # get_stock_object_from_str is a function that should be available in all the stock
            # objects; it accepts the string returned from the website and generates a dict of
            # stock objects shaped like {stock: {date: object}}.
            # The object module is referenced dynamically, so class name and file name must be identical.
            #exec('from object_impl.{object} import {object}'.format(object=stock_obj_name), globals())
            stock_dict = eval('{object}.get_stock_object_from_str(row)'.format(object=stock_obj_name, row=row))
            for stock in stock_dict:  # for the Tengxun or Sina interface, there is just one stock per dict
                for date in stock_dict[stock]:  # ... and just one date per stock
                    stock_obj = stock_dict[stock][date]  # the stock implementation object
                    # prefix 'stock_obj.' to each attr and concatenate the attrs into one string
                    value_sql = reduce(lambda x, y: (x if re.match(r'stock_obj', x) else 'stock_obj.' + x + ', ') + "stock_obj.{attr_name}, ".format(attr_name=y), obj_attrs)
                    value_sql = value_sql[0:-2]  # remove the trailing comma and the blankspace next to it
                    value_sql = eval(value_sql)  # a tuple is returned
                    final_value_sql = ''
                    del_where = ''
                    for i, v in enumerate(value_sql):
                        # date and varchar are quoted with single quotes; otherwise no quote, or Null if the value is empty
                        value = "'" + v + "'" if tab_types[i] == 'date' or tab_types[i] == 'varchar' else ('Null' if len(str(v)) == 0 else str(v))
                        final_value_sql = final_value_sql + value + ', '
                        if tab_fields[i] in tab_pk:
                            del_where = del_where + ' and {field}={value}'.format(field=tab_fields[i], value=value)
                    final_value_sql = final_value_sql[0:-2]
                    del_complete_sql = del_sql + del_where
                    ins_complete_sql = ins_sql + ' values( ' + final_value_sql + ')'
                    #print_log('Deleting [{stock},{date}] from {tab_name}...\n {sql}'.format(stock=stock, date=date, tab_name=tab_name, sql=del_complete_sql), log_fh)
                    cur = get_cur(conn)
                    cur.execute(del_complete_sql)
                    cur.execute(ins_complete_sql)
                    print_log('Inserted [{stock},{date}] into {tab_name}.'.format(stock=stock, date=date, tab_name=tab_name), log_fh)
                    num += 1
                    if num % 1000 == 0:
                        conn.commit()
    conn.commit()
    print_log('{num} records have been written into {tab_name}.'.format(num=num, tab_name=tab_name), log_fh)
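# A slightly fuller yml sketch under the same convention (field names are illustrative;
# only the structure is taken from the function above: `type` drives quoting, `is_pk`
# drives the delete-before-insert predicate, and a Null attribute excludes the field):
#
#     # dw.stock_transaction.yml  (hypothetical; the base name becomes the table name)
#     biz_date:
#       type: date
#       is_pk: Y
#       stock_object:
#         Tengxun_stock: date
#         Sina_stock: date
#     close_price:
#       type: decimal
#       is_pk: N
#       stock_object:
#         Tengxun_stock: cur_price
#         Sina_stock: Null          # excluded from the insert for Sina_stock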
def load_into_bankuai(db_conn, file, biz_date=None):
    # 板块  子版块    板块名称  涨跌幅  总市值(亿)  换手率  上涨家数  下跌家数  领涨股票代码  领涨股票  领涨股票涨跌幅
    # 板块  概念板块  全息技术  3.95%   365.12      11.65   7         1         600288        大恒科技  10.03
    # 板块  概念板块  网络安全  2.95%   818.79      25.61   19        1         002308        威创股份  10.01
    #
    # target table dw.bankuai:
    #   biz_date date not null,
    #   bankuai_id integer not null,
    #   rise varchar(16),
    #   market_value_in_million decimal(12,2),
    #   turnover_rate decimal(5,2),
    #   num_of_rise integer,
    #   num_of_drop integer,
    #   leading_stock_id varchar(6),
    #   rise_of_leading_stock decimal(10,2),
    #   primary key(biz_date, bankuai_id)
    bk_id_dict = {}
    csv_data = []
    v_biz_date = ""

    #-- build dict for bankuai name and bankuai id from db
    select_sql = 'select t.name, t.id from dw.dim_bankuai t'
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_name = db_row["name"].decode("utf-8")
        db_id = db_row["id"]
        bk_id_dict[db_name] = db_id
    print_log("There are %(num)s records read from %(name)s" % {"num": len(bk_id_dict.keys()), "name": 'dw.dim_bankuai'})

    #-- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    for row in csvr:
        bk_name = row[u'板块名称'.encode("gbk")].decode("gbk")
        bk_id = bk_id_dict[bk_name]
        row_dict = {}
        row_dict[bk_id] = {}
        row_dict[bk_id]["rise"] = row[u'涨跌幅'.encode("gbk")].decode("gbk")
        row_dict[bk_id]["market_value_in_million"] = row[u'总市值(亿)'.encode("gbk")]
        row_dict[bk_id]["turnover_rate"] = row[u'换手率'.encode("gbk")]
        row_dict[bk_id]["num_of_rise"] = row[u'上涨家数'.encode("gbk")]
        row_dict[bk_id]["num_of_drop"] = row[u'下跌家数'.encode("gbk")]
        row_dict[bk_id]["leading_stock_id"] = row[u'领涨股票代码'.encode("gbk")]
        row_dict[bk_id]["rise_of_leading_stock"] = row[u'领涨股票涨跌幅'.encode("gbk")]
        csv_data.append(row_dict)
    csvf.close()
    print_log("%(num)s records have been read from %(name)s." % {"num": len(csv_data), "name": file})

    #-- determine biz_date: explicit argument first, otherwise parse it from the file name
    if biz_date is not None:
        if re.search(r'\d{8}', biz_date):
            v_biz_date = biz_date
        else:
            raise RuntimeError(biz_date + " is not a valid date format, the date should be like YYYYMMDD.")
    elif re.search(r'.*(?P<date>\d{8})\.csv', file):
        v_biz_date = re.search(r'.*(?P<date>\d{8})\.csv', file).group("date")
    else:
        raise RuntimeError('Can not determine biz_date, please check if the file name has a date included, or pass biz_date when calling the function.')
    v_biz_date_dt = datetime.datetime.strptime(v_biz_date, '%Y%m%d')

    #-- delete biz_date from dw.bankuai
    del_sql = "delete from dw.bankuai where biz_date = '%(date)s'" % {'date': v_biz_date_dt}
    cur.execute(del_sql)
    db_conn.commit()
    print_log("Deleted records from dw.bankuai where biz_date = '%(biz_date)s'." % {"biz_date": v_biz_date})

    #-- insert into dw.bankuai
    iter = 0
    for r in csv_data:
        k = r.keys()[0]
        iter += 1
        ins_sql = '''insert into dw.bankuai(
                       biz_date, bankuai_id, rise, market_value_in_million, turnover_rate,
                       num_of_rise, num_of_drop, leading_stock_id, rise_of_leading_stock)
                     values(
                       '%(biz_date)s', %(bankuai_id)s, '%(rise)s', %(market_value_in_million)s, %(turnover_rate)s,
                       %(num_of_rise)s, %(num_of_drop)s, '%(leading_stock_id)s', %(rise_of_leading_stock)s)''' % {
            'biz_date': v_biz_date_dt,
            'bankuai_id': k,
            'rise': r[k]['rise'],
            'market_value_in_million': r[k]['market_value_in_million'],
            'turnover_rate': r[k]['turnover_rate'],
            'num_of_rise': r[k]['num_of_rise'],
            'num_of_drop': r[k]['num_of_drop'],
            # sometimes eastmoney doesn't return a valid leading stock id but '-';
            # in that case '000000' stands in as the unknown stock id
            'leading_stock_id': r[k]['leading_stock_id'] if r[k]['leading_stock_id'] != '-' else '000000',
            'rise_of_leading_stock': r[k]['rise_of_leading_stock']
        }
        cur.execute(ins_sql)
        db_conn.commit()
    print_log(str(iter) + " inserted into dw.bankuai.")
    print_log("dw.bankuai has been refreshed successfully.")
def load_into_dim_bankuai(db_conn, file, parent_bankuai_ids={u"概念板块": 1, u"地域板块": 2, u"行业板块": 3}):
    #-- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    bankuais = {}
    invalid_bankuai_ids = []
    #---- get parent_bankuai_id, bankuai_name from csv
    for row in csvr:
        bankuai = row[u"板块名称".encode("gbk")].decode("gbk")
        parent_bankuai = row[u"子版块".encode("gbk")].decode("gbk")
        parent_bankuai_id = parent_bankuai_ids[parent_bankuai]
        bankuais[bankuai] = {}
        bankuais[bankuai]["parent_bankuai_id"] = parent_bankuai_id
        #bankuais[bankuai].setdefault("parent_bankuai_id", parent_bankuai_id)
    csvf.close()
    print_log("%(num)s records have been read from %(fname)s." % {"num": len(bankuais.keys()), "fname": file})

    #---- get parent_bankuai_id, bankuai_name from db and search for the combination in the csv dict;
    #     if it doesn't exist, add it to invalid_bankuai_ids
    select_sql = "select t.parent_bankuai_id, t.name, t.id from dw.dim_bankuai t where t.is_valid = 'Y'"
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_bankuai = db_row["name"].decode("utf-8")
        db_parent_bankuai_id = db_row["parent_bankuai_id"]
        db_id = db_row["id"]
        if db_bankuai in bankuais:
            if db_parent_bankuai_id == bankuais[db_bankuai]["parent_bankuai_id"]:
                # drop from bankuais if it's already in the table with is_valid='Y'
                del bankuais[db_bankuai]
            else:
                invalid_bankuai_ids.append(str(db_id))
        else:
            invalid_bankuai_ids.append(str(db_id))

    #---- mark bankuais is_valid=N
    if len(invalid_bankuai_ids) > 0:
        invalid_bankuai_ids_str = ",".join(invalid_bankuai_ids)
        print_log("Invalid bankuai ids: " + invalid_bankuai_ids_str)
        upd_sql = "update dw.dim_bankuai t set is_valid = 'N', upd_time = now() where t.id in (%(ids)s)" % {"ids": invalid_bankuai_ids_str}
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No invalid bankuai ids.")

    #---- insert new bankuais into dim_bankuai
    if len(bankuais.keys()) > 0:
        values = []
        print_log("There are %(num)s bankuais to be inserted." % {"num": len(bankuais.keys())})
        for b in bankuais:
            values.append("('%(name)s', '%(parent_bankuai_id)s', now(), 'Y')" % {"name": b, "parent_bankuai_id": bankuais[b]["parent_bankuai_id"]})
        values_str = ",".join(values)
        ins_sql = "insert into dw.dim_bankuai(name, parent_bankuai_id, upd_time, is_valid) values %(values)s" % {"values": values_str}
        cur.execute(ins_sql)
        db_conn.commit()
    else:
        print_log("No new bankuai ids.")
    print_log("dw.dim_bankuai has been refreshed successfully.")
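# Usage sketch (hypothetical): the default parent_bankuai_ids mirrors dw.dim_parent_bankuai,
# and the mapping can also be read live via return_parent_bankuai_ids instead of relying
# on the hard-coded default (the file name below is illustrative):
#
#     ids = return_parent_bankuai_ids(db_conn)  # e.g. {u'概念板块': 1, u'地域板块': 2, u'行业板块': 3}
#     load_into_dim_bankuai(db_conn, 'bankuai_20150605.csv', parent_bankuai_ids=ids)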
def load_into_dim_stock(db_conn, file):
    #-- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    codes = {}
    codes_to_update = {}
    codes_to_valid = []
    codes_to_invalid = []
    # 板块  子版块    板块名称  股票代码  股票名称
    # 板块  概念板块  送转预期  600587    新华医疗
    for row in csvr:
        code = row[u'股票代码'.encode("gbk")].decode("gbk")
        name = row[u'股票名称'.encode("gbk")].decode("gbk")
        codes[code] = name
    csvf.close()
    print_log("%(num)s records have been read from %(fname)s." % {"num": len(codes.keys()), "fname": file})

    #---- get id, name from db and search for the combination in the csv dict
    #     if the id exists with a different name, update the name
    #     if the id doesn't exist in the csv, mark is_valid=N
    select_sql = "select t.id, t.name, t.is_valid from dw.dim_stock t /*where t.is_valid = 'Y'*/"
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_name = db_row["name"].decode("utf-8")
        db_id = db_row["id"]
        db_is_valid = db_row["is_valid"]
        if db_id in codes and db_is_valid == "Y":
            if db_name == codes[db_id]:
                # drop from codes if it's already in the table and the name is unchanged
                del codes[db_id]
            else:
                # drop from codes; the codes_to_update dict will drive the name update
                codes_to_update[db_id] = codes[db_id]
                del codes[db_id]
        elif db_id in codes and db_is_valid == "N":
            codes_to_valid.append("'" + str(db_id) + "'")
            del codes[db_id]
        elif db_is_valid == "N":
            # not in the csv file and already invalid in db: do nothing
            pass
        else:
            # not in the csv, but valid in db: mark it invalid
            codes_to_invalid.append("'" + str(db_id) + "'")

    #---- mark stocks is_valid=N
    if len(codes_to_invalid) > 0:
        codes_to_invalid_str = ",".join(codes_to_invalid)
        print_log("There are %(num)s stocks to be marked invalid. %(stocks)s" % {"num": len(codes_to_invalid), "stocks": codes_to_invalid_str})
        upd_sql = "update dw.dim_stock t set is_valid = 'N', upd_time = now() where t.id in (%(ids)s)" % {"ids": codes_to_invalid_str}
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stocks need to be marked invalid.")

    #---- mark stocks is_valid=Y
    if len(codes_to_valid) > 0:
        codes_to_valid_str = ",".join(codes_to_valid)
        print_log("There are %(num)s stocks to be marked valid. %(stocks)s" % {"num": len(codes_to_valid), "stocks": codes_to_valid_str})
        upd_sql = "update dw.dim_stock t set is_valid = 'Y', upd_time = now() where t.id in (%(ids)s)" % {"ids": codes_to_valid_str}
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stocks need to be marked valid.")

    #---- update stock names in dim_stock
    if len(codes_to_update.keys()) > 0:
        print_log("There are %(num)s stocks to be updated." % {"num": len(codes_to_update.keys())})
        for id in codes_to_update:
            print_log(id)
            upd_sql = "update dw.dim_stock t set name = '%(name)s', upd_time = now() where t.id = '%(id)s'" % {"id": id, "name": codes_to_update[id]}
            cur.execute(upd_sql)
            db_conn.commit()
    else:
        print_log("No stocks need to be updated.")

    #---- insert new stocks into dim_stock
    if len(codes.keys()) > 0:
        values = []
        print_log("There are %(num)s stocks to be inserted." % {"num": len(codes.keys())})
        for b in codes:
            print_log(b)
            values.append("('%(id)s', '%(name)s', now(), 'Y')" % {"id": b, "name": codes[b]})
        values_str = ",".join(values)
        ins_sql = "insert into dw.dim_stock(id, name, upd_time, is_valid) values %(values)s" % {"values": values_str}
        cur.execute(ins_sql)
        db_conn.commit()
    else:
        print_log("No new stock ids.")
    print_log("dw.dim_stock has been refreshed successfully.")
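# A hypothetical end-to-end refresh for one day's snapshot, in dependency order
# (dim_bankuai has to be current before the bridge and fact loads can resolve bankuai ids;
# file names are illustrative, and the two csv layouts differ):
#
#     dim_file = 'stock_bankuai_20150605.csv'   # 板块名称/股票代码/股票名称 layout
#     fact_file = 'bankuai_20150605.csv'        # bankuai summary layout
#     load_into_dim_bankuai(db_conn, dim_file)
#     load_into_dim_stock(db_conn, dim_file)
#     load_into_dim_stock_bankuai(db_conn, dim_file)
#     load_into_bankuai(db_conn, fact_file)     # biz_date parsed from the file name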