import copy
import gc
import json
import os
import string
import time
import zipfile
from datetime import datetime, timedelta

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR

# Local project modules (import paths assumed from the usage below);
# REMOTE_SAVE_FOLDER, todayQueryStations, conn_db and upload_file are expected
# to be defined elsewhere in this project.
import commonly_params
import myZip
import process_files
import public_function


def trans_erpsku_info_from_sql_2_redis():
    """
    Load the erpsku information from the company server, derive erpsku, asin
    and seller_sku from it, and store the result in redis (db=0) under a key
    of the form erpsku_info_<date>_<hour>.
    :return: None
    """
    # Load the erpsku information
    engine = create_engine("mysql+pymysql://{}:{}@{}:{}/{}?charset={}".format(
        'mrp_read', 'mrpread', '47.106.127.183', 3306, 'mrp_py', 'utf8'))
    conn = engine.connect()
    select_erpsku_sql = 'SELECT 标识,erp_sku FROM gross_require'
    erpsku_info = pd.read_sql(select_erpsku_sql, conn)
    conn.close()
    # Derive erpsku, asin and seller_sku from the raw identifier column.
    # A '标识' value looks like
    # 'Socialme美国$SMU-JHX-8WMb0P-JY18828-02 FBA$X0025AUPJP-B07Q6W6LXC@JY18828-02'
    # and is composed as station_name$sellersku$fnsku-asin@erpsku.
    erpsku_info['seller_sku'] = erpsku_info['标识'].apply(
        lambda x: x.split('$')[1])
    erpsku_info['asin'] = erpsku_info['标识'].apply(
        lambda x: x.split('$')[2][11:21])
    erpsku_info.rename(columns={'erp_sku': 'erpsku'}, inplace=True)
    erpsku_info = erpsku_info[['erpsku', 'asin', 'seller_sku']]
    # Store erpsku_info in redis
    now_datetime = datetime.now().strftime('%Y-%m-%d_%H')
    erpsku_redis_key = 'erpsku_info_{}'.format(now_datetime)
    conn_redis = public_function.Redis_Store(decode_responses=False, db=0)
    conn_redis.redis_upload_df(erpsku_redis_key, erpsku_info)
    conn_redis.close()
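
# Hedged sketch: a standalone illustration of the identifier split used above.
# The sample value is adapted from the docstring; the fnsku is assumed to be 10
# characters, so the ASIN occupies offsets 11:21 of the third '$' segment.
def _demo_parse_identifier():
    identifier = 'Socialme美国$SMU-JHX-8WMb0P-JY18828-02 FBA$X0025AUPJP-B07Q6W6LXC@JY18828-02'
    seller_sku = identifier.split('$')[1]    # 'SMU-JHX-8WMb0P-JY18828-02 FBA'
    asin = identifier.split('$')[2][11:21]   # 'B07Q6W6LXC'
    erpsku = identifier.split('@')[-1]       # 'JY18828-02'
    return erpsku, asin, seller_sku
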
def load_station_campaign_report(station_name):
    """
    Load a station's advertising (CP) report.
    Args:
        station_name: str station name
    Returns: pd.DataFrame the station's report data
    """
    redis_conn = public_function.Redis_Store(db=2)
    five_files_redis_sign = commonly_params.five_files_redis_sign
    all_redis_keys = redis_conn.keys()
    station_report_key = [
        key for key in all_redis_keys
        if (five_files_redis_sign in key) & (station_name.upper() in key)
        & ('CP' in key)
    ]
    if len(station_report_key) != 1:
        raise ValueError(
            f'{station_name}_CP has multiple redis keys or none. Please check the redis database.'
        )
    station_report_key = station_report_key[0]
    station_report_pkl_path = redis_conn.get(station_report_key)
    redis_conn.close()
    return process_files.read_pickle_2_df(station_report_pkl_path)
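
# Hedged usage sketch ('smandy_it' is a station name borrowed from a comment
# further down; any configured station would do):
#   cp_report = load_station_campaign_report('smandy_it')
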
def find_new_station(date_range=1) -> list:
    """
    Find the list of stations that need to be updated.
    Args:
        date_range (int) default 1: how many days back to look for stations
    Returns: list of stations that need to be processed
    """
    # Initialize redis
    try:
        redis_conn = public_function.Redis_Store(db=2)
    except Exception:
        raise ConnectionError('Can not connect redis.')
    # Get the five-files report timestamps per station
    five_files_redis_sign = commonly_params.five_files_redis_sign
    all_redis_keys = redis_conn.keys()
    redis_conn.close()
    five_files_redis_keys = [
        key for key in all_redis_keys if five_files_redis_sign in key
    ]
    # The last 14 characters of each redis key are the report upload time; the
    # station sits in the middle, e.g. 'FIVE_FILES_KEYS_SAVE:02_AU_AC_20200718105127'.
    # A key is composed of the 20-char sign ('FIVE_FILES_KEYS_SAVE') + station
    # + 2-char report type + 14-char timestamp.
    # Take the stations from the last date_range days before today.
    now_date = datetime.today().date()
    start_date = now_date - timedelta(days=date_range)
    return [
        key[21:-18] for key in five_files_redis_keys
        if (datetime.strptime(key[-14:], '%Y%m%d%H%M%S').date() >= start_date)
        & (datetime.strptime(key[-14:], '%Y%m%d%H%M%S').date() < now_date)
    ]
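
# Hedged sketch of the five-files key layout described above: the 20-char sign
# 'FIVE_FILES_KEYS_SAVE' + ':' + station + '_' + 2-char report type + '_' +
# 14-char timestamp. The sample key is taken from the comment in
# find_new_station().
def _demo_parse_five_files_key(key='FIVE_FILES_KEYS_SAVE:02_AU_AC_20200718105127'):
    station = key[21:-18]          # '02_AU'
    report_type = key[-17:-15]     # 'AC'
    report_time = datetime.strptime(key[-14:], '%Y%m%d%H%M%S')
    return station, report_type, report_time
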
def load_sku_erpsku():
    """
    Load sku with erpsku and asin.
    Decoding matters here: the erpsku payload is stored in redis encoded
    (binary) while its key is stored decoded, so the key is looked up with
    decode_responses=True and the payload is fetched with
    decode_responses=False.
    Returns: pd.DataFrame
    """
    # 1. Get the erpsku info from redis; it is stored under a key of the form
    #    erpsku_info_<date>_<hour>.
    conn_redis = public_function.Redis_Store(decode_responses=True, db=0)
    redis_db0_keys = conn_redis.keys()
    erpsku_redis_key_sign = commonly_params.erpsku_redis_sign
    erpsku_exist_key = [
        key for key in redis_db0_keys if erpsku_redis_key_sign in key
    ][0]
    conn_redis.close()
    conn_redis = public_function.Redis_Store(decode_responses=False, db=0)
    erpsku_info = conn_redis.redis_download_df(erpsku_exist_key)
    conn_redis.close()
    return erpsku_info
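
# Hedged usage sketch: the frame returned here is expected to carry the columns
# written by trans_erpsku_info_from_sql_2_redis():
#   sku_map = load_sku_erpsku()
#   # sku_map.columns -> ['erpsku', 'asin', 'seller_sku']
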
def load_station_report(station_name, report_type='cp'):
    """
    Load a station's advertising report.
    Args:
        station_name: str station name
        report_type: str report type
    Returns: pd.DataFrame the station's report data
    """
    redis_conn = public_function.Redis_Store(db=2)
    five_files_redis_sign = commonly_params.five_files_redis_sign
    all_redis_keys = redis_conn.keys()
    station_report_key = [
        key for key in all_redis_keys
        if (five_files_redis_sign in key) & (station_name.upper() in key)
        & (report_type.upper() == key[-17:-15].upper())
    ]
    if len(station_report_key) > 1:
        # Multiple keys exist for this station/report type: keep the one with
        # the latest 14-char trailing timestamp.
        station_report_key_time_dict = {
            key: key[-14:] for key in station_report_key
        }
        station_report_key = [
            key for key, time in station_report_key_time_dict.items()
            if time == max(station_report_key_time_dict.values())
        ][0]
    elif len(station_report_key) == 1:
        station_report_key = station_report_key[0]
    else:
        raise ValueError(
            f'{station_name}_{report_type} has no redis key.')
    station_report_pkl_path = redis_conn.get(station_report_key)
    redis_conn.close()
    return process_files.read_pickle_2_df(station_report_pkl_path)
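
# Hedged sketch of the "keep the newest key" rule used above: among candidate
# keys, the one with the largest 14-char trailing timestamp wins (the sample
# keys are illustrative).
def _demo_pick_newest_key():
    candidates = [
        'FIVE_FILES_KEYS_SAVE:02_AU_CP_20200718105127',
        'FIVE_FILES_KEYS_SAVE:02_AU_CP_20200719090000',
    ]
    return max(candidates, key=lambda key: key[-14:])  # the 20200719 key
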
def zipped_folders_2_pickle(zipped_files: 'path',
                            unzipped_file_save_folder=None,
                            folder_save_pkl=False,
                            delete_file=True) -> None:
    """
    Serialize the files inside a zipped station folder to pickle and store the
    pickle paths in redis.
    Parameters:
        zipped_files: path object
            zip archive containing the station folder
        unzipped_file_save_folder: path object, default None
            folder to unzip into; defaults to the archive's own folder
        folder_save_pkl: path object, or False, default False
            folder to store the pickle files in; defaults to the unzip folder
        delete_file: bool, default True
            whether to delete the unzipped files after pickling
    Returns:
        None
    """
    if not os.path.exists(zipped_files):
        raise FileNotFoundError('{} can not be found.'.format(zipped_files))
    # If no target folder is given, unzip next to the archive itself.
    if unzipped_file_save_folder is None:
        unzipped_file_save_folder = os.path.dirname(zipped_files)
    if not os.path.exists(unzipped_file_save_folder):
        os.mkdir(unzipped_file_save_folder)
    if not os.path.isdir(unzipped_file_save_folder):
        raise ValueError(
            '{} is not a folder.'.format(unzipped_file_save_folder))
    # If no pickle folder is given, store the pickles in the unzip folder.
    if not folder_save_pkl:
        folder_save_pkl = unzipped_file_save_folder
    if not os.path.exists(folder_save_pkl):
        os.mkdir(folder_save_pkl)
    if not os.path.isdir(folder_save_pkl):
        raise ValueError('{} is not a folder.'.format(folder_save_pkl))
    # 1. Unzip the archive
    process_files.unzip_folder(zipped_files,
                               save_folder=unzipped_file_save_folder)
    # 2. Pickle the files
    # 2.1 Collect the full paths of all unzipped files
    all_files = []
    station_name = os.path.splitext(os.path.basename(zipped_files))[0]
    station_name = station_name.upper()
    station_folder = os.path.join(unzipped_file_save_folder, station_name)
    for root, _, files in os.walk(station_folder):
        for file in files:
            file_path = os.path.join(root, file)
            all_files.append(file_path)

    # 2.2 Pickle them (sheet names should be standardized here to guard
    # against other languages). Pickles are saved under folder_save_pkl with a
    # standardized name (account_site_filetype).
    def standardize_file_pickle_name(file_path):
        """
        Standardize the pickle file name (account_site_filetype).
        :param file_path:
        :return:
        """
        if not os.path.exists(file_path):
            raise FileExistsError('{} not exists.'.format(file_path))
        if not os.path.isfile(file_path):
            raise FileExistsError('{} not a file.'.format(file_path))
        station_name = os.path.basename(os.path.dirname(file_path))
        station_name = station_name.upper()
        account = station_name[:-3]
        site = station_name[-2:]
        # Keyword matching cannot recognize the all-orders (ao) report, whose
        # file name is purely numeric.
        file_type = [
            type for type, keyword in public_function.FILE_RECOGNIZE.items()
            if keyword in file_path.lower()
        ]
        if len(file_type) == 1:
            file_type = file_type[0]
        else:
            if os.path.splitext(os.path.basename(file_path))[0].isdigit():
                file_type = 'ao'
            else:
                file_type = 'None'
        return account + '_' + site + '_' + file_type + '.pkl'

    # Save each file as pickle and record its path in redis.
    redis_store = public_function.Redis_Store()
    keys = redis_store.keys()
    # Creation time of the zip archive
    last_time_timestamp = os.path.getctime(zipped_files)
    last_time = datetime.fromtimestamp(last_time_timestamp).strftime(
        '%Y%m%d%H%M%S')
    sign_key = 'FIVE_FILES_KEYS_SAVE'
    for file_path in all_files:
        file_pickle_path = os.path.join(
            folder_save_pkl, standardize_file_pickle_name(file_path))
        process_files.write_file_2_pickle(file_path,
                                          pkl_path=file_pickle_path)
        # The redis key is sign + account_site_filetype + timestamp, where the
        # 'FIVE_FILES_KEYS_SAVE' sign marks the project.
        file_redis_key = sign_key + ':' + standardize_file_pickle_name(
            file_path).replace('.pkl', '') + '_' + last_time
        file_redis_key = file_redis_key.upper()
        redis_store.set(file_redis_key, file_pickle_path)
        # Delete the keys previously stored for this station
        [
            redis_store.delete(key) for key in keys
            if (sign_key in key) and (station_name in key) and
            (last_time not in key)
        ]
        if delete_file:
            os.remove(file_path)
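
# Hedged usage sketch (paths are hypothetical; the archive name doubles as the
# station folder, so 'smandy_it.zip' is expected to unzip into 'SMANDY_IT'):
#   zipped_folders_2_pickle(r'F:\five_reports_zipped\smandy_it.zip',
#                           unzipped_file_save_folder=r'F:\unzipped',
#                           delete_file=True)
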
def save_five_reports():
    """
    Sync the remotely requested five reports to redis and the zip folder:
    1. Store the station report type -> path key-value pairs in redis.
    2. Compress the files into the five-reports zip folder.
    :return:
    """

    def file_redis_expire_time(file_time, expireDay=3):
        """
        Expire time (in seconds) for a file key stored in redis.
        :param file_time: datetime the file's creation time
        :param expireDay: int days to keep the key
        :return:
        """
        if file_time is None:
            return
        if not isinstance(file_time, datetime):
            return
        expireDate = file_time + timedelta(days=expireDay)
        expireDatetime = datetime(expireDate.year, expireDate.month,
                                  expireDate.day)
        return int((expireDatetime - datetime.now()).total_seconds())

    if not os.path.isdir(REMOTE_SAVE_FOLDER):
        raise FileNotFoundError(f'{REMOTE_SAVE_FOLDER} not found.')
    # 1. Store to redis
    # Keywords in the file path that identify each report type
    reportTypeSignDict = {
        'bulk': 'cp',
        'business': 'br',
        'search': 'st',
        'active': 'ac',
        'orders': 'ao'
    }
    stationTypeDict = {}
    # Walk the remote save folder, collecting the report types per station and
    # deleting reports older than three days.
    threeDayBeforeStr = (datetime.now() -
                         timedelta(days=3)).strftime('%Y-%m-%d %H:%M:%S')
    for root, dirs, files in os.walk(REMOTE_SAVE_FOLDER):
        stationName = os.path.basename(root)
        # Only compress the stations requested today
        if stationName.lower() not in todayQueryStations:
            continue
        filesType = {}
        # # # todo test a single station
        # if 'smandy_it' not in root:
        #     continue
        for file in files:
            fileFullpath = os.path.join(root, file)
            if process_files.file_create_time(
                    fileFullpath) < threeDayBeforeStr:
                if os.path.exists(fileFullpath):
                    os.remove(fileFullpath)
                continue
            for signWord, type in reportTypeSignDict.items():
                if signWord in file.lower():
                    fileTypePath = filesType.get(type, [])
                    fileTypePath.append(os.path.join(stationName, file))
                    filesType[type] = fileTypePath
                    break
            # The ao report needs separate handling: its name is purely numeric
            if os.path.splitext(file)[0].isdigit():
                fileTypeAo = filesType.get('ao', [])
                fileTypeAo.append(os.path.join(stationName, file))
                filesType['ao'] = fileTypeAo
        if filesType:
            # A type may have several reports; keep the newest one and delete
            # the others.
            for type, path in filesType.items():
                if isinstance(path, list):
                    if len(path) == 1:
                        filesType[type] = os.path.join(REMOTE_SAVE_FOLDER,
                                                       path[0])
                    else:
                        allFileFullPath = [
                            os.path.join(REMOTE_SAVE_FOLDER, onePath)
                            for onePath in path
                        ]
                        newestPath = process_files.newest_file(allFileFullPath)
                        filesType[type] = newestPath
                        if newestPath is not None:
                            try:
                                [
                                    os.remove(path) for path in allFileFullPath
                                    if (path != newestPath) and (
                                        os.path.exists(path))
                                ]
                            except Exception as e:
                                print(e)
                                continue
            stationTypeDict[stationName] = filesType
    # Make sure each station's zip archive exists
    stationsZipFolderPath = r"F:\five_reports_zipped"
    todayWeekDay = datetime.now().weekday()
    for station, stationNewFileTypeDict in stationTypeDict.items():
        stationZipFile = os.path.join(stationsZipFolderPath,
                                      station.lower() + '.zip')
        if not os.path.isfile(stationZipFile):
            with zipfile.ZipFile(stationZipFile, 'w') as file:
                pass
        # File types already present in the zip archive
        stationExistFile = myZip.zipFileList(stationZipFile)
        stationExistFileTypeDict = process_files.file_type(stationExistFile)
        # Types that are replaced by newly requested files
        deleteType = set(stationNewFileTypeDict.keys()) & set(
            stationExistFileTypeDict.keys())
        try:
            [
                myZip.zip_delete(stationZipFile, path) for key in deleteType
                for path in stationExistFileTypeDict.get(key)
            ]
        except Exception as e:
            print(e)
            print(station)
            continue
        # Write the new files in
        with zipfile.ZipFile(stationZipFile, 'a') as wfile:
            for _, file in stationNewFileTypeDict.items():
                if file is None:
                    continue
                if os.path.exists(file):
                    station = station.strip().replace('-', '_').replace(
                        ' ', '_').lower()
                    targetPath = os.path.join(station,
                                              os.path.basename(file).lower())
                    wfile.write(file, targetPath)
        # Delete files older than two days from the zip archive
        fileCreateTime = {
            file: myZip.file_create_time_in_zip(stationZipFile, file)
            for file in myZip.zipFileList(stationZipFile)
            if not file.endswith('/')
        }
        try:
            [
                myZip.zip_delete(stationZipFile, file)
                for file, fileTime in fileCreateTime.items()
                if isinstance(fileTime, datetime) and
                (datetime.now().date() - fileTime.date()).days > 2
            ]
        except Exception as e:
            print(f"Problem handling {station}'s zip archive")
            print(station)
            print(e)
            continue
        # Delete this station's keys in redis, then add the new ones
        stationsFileTypeRedisSignKey = 'api_request_files'
        _connRedis = public_function.Redis_Store(db=1)
        # [_connRedis.delete(key) for key in _connRedis.keys() if (key.startswith(stationsFileTypeRedisSignKey)) and (station.lower() == key[len(stationsFileTypeRedisSignKey) + 1:len(key) - 3])]
        stationExistFile = myZip.zipFileList(stationZipFile)
        stationExistFileTypeDict = process_files.file_type(stationExistFile)
        for type, files in stationExistFileTypeDict.items():
            for file in files:
                fileTime = myZip.file_create_time_in_zip(stationZipFile, file)
                redisExpireTime = file_redis_expire_time(fileTime, expireDay=3)
                if redisExpireTime is not None:
                    _connRedis.set(
                        f'{stationsFileTypeRedisSignKey}:{station.lower()}_{type}',
                        todayWeekDay,
                        ex=int(redisExpireTime))
    # Logging
    stationRequestTypeResult = {
        station: list(stationTypeValue.keys())
        for station, stationTypeValue in stationTypeDict.items()
    }
    resultTypeDict = {'ac': [], 'br': [], 'ao': [], 'st': [], 'cp': []}
    allTypeMsg = ''
    for type in resultTypeDict.keys():
        for station, stationType in stationRequestTypeResult.items():
            if type in stationType:
                resultTypeDict[type].append(station)
        typeLen = len(resultTypeDict[type])
        typeMsg = (f'{type} reports received: {typeLen} '
                   f'({round(typeLen / len(todayQueryStations) * 100, 2)}% of requested stations)\n')
        allTypeMsg += typeMsg
    msg = (f'{datetime.now().date()}: requested {len(todayQueryStations)} stations.\n'
           f'Details:\n{allTypeMsg}')
    print(msg)
    # Stations whose reports were not received, per report type
    stationMissedDict = {
        station: list(set(todayQueryStations) - set(stationList))
        for station, stationList in resultTypeDict.items()
    }
    stationMissedDictMsg = json.dumps(stationMissedDict)
    allRequestStationMsg = json.dumps(list(todayQueryStations))
    logPath = r"F:\five_reports_zipped\request_stations_log.txt"
    with open(logPath, 'w+') as f:
        f.write(f'{msg}\n')
        f.write(f'Stations requested today: {allRequestStationMsg}\n\n')
        f.write(f'Missing stations per report type: {stationMissedDictMsg}\n')
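
# Hedged sketch mirroring the file_redis_expire_time() helper inside
# save_five_reports(): a file created at 2020-07-18 10:00 with expireDay=3
# expires at 00:00 on 2020-07-21, and the redis TTL is the number of seconds
# from now until then.
def _demo_expire_seconds(file_time=datetime(2020, 7, 18, 10, 0), expire_day=3):
    expire_date = file_time + timedelta(days=expire_day)
    expire_datetime = datetime(expire_date.year, expire_date.month,
                               expire_date.day)
    return int((expire_datetime - datetime.now()).total_seconds())
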
def db_upload_st_file(st_info_folder=r'E:\AD_WEB\file_dir\st_info'):
    """
    Main function: upload the updated files in the st_info folder to the
    database.
    :param st_info_folder: str path of the st_info folder
    :return: None
    """
    # 1. Get the erpsku info from redis; if it is missing there, load it from
    #    the database into redis first. The erpsku info is stored in redis
    #    under a key of the form erpsku_info_<date>_<hour>.
    conn_redis = public_function.Redis_Store(decode_responses=True, db=0)
    redis_db0_keys = conn_redis.keys()
    erpsku_redis_key_sign = 'erpsku_info'
    now_datetime = datetime.now()
    now_date = now_datetime.strftime('%Y-%m-%d')
    now_hour = now_datetime.hour
    refresh = 7
    if now_hour >= refresh:
        erpsku_today_key = [
            key for key in redis_db0_keys
            if (erpsku_redis_key_sign in key) and (now_date in key) and (
                int(key.split('_')[-1]) >= refresh)
        ]
        if not erpsku_today_key:
            # Refresh the erpsku keys in redis
            [
                conn_redis.delete(key) for key in redis_db0_keys
                if erpsku_redis_key_sign in key
            ]
            trans_erpsku_info_from_sql_2_redis()
            # Re-read the keys after the refresh so the lookup below does not
            # pick a deleted key
            redis_db0_keys = conn_redis.keys()
    erpsku_exist_key = [
        key for key in redis_db0_keys if erpsku_redis_key_sign in key
    ][0]
    conn_redis.close()
    conn_redis = public_function.Redis_Store(decode_responses=False, db=0)
    erpsku_info = conn_redis.redis_download_df(erpsku_exist_key)
    conn_redis.close()
    # Detect updated st reports.
    # First record the st files' modification times.
    old_files_list = [
        os.path.join(st_info_folder, file)
        for file in os.listdir(st_info_folder) if 'ST' in file
    ]
    old_files_modify_time = {
        file: os.path.getmtime(file) for file in old_files_list
    }
    while 1:
        new_files_list = [
            os.path.join(st_info_folder, file)
            for file in os.listdir(st_info_folder) if 'ST' in file
        ]
        new_files_modify_time = {
            file: os.path.getmtime(file) for file in new_files_list
        }
        process_st_files = [
            file for file, file_time in new_files_modify_time.items()
            if file_time != old_files_modify_time.get(file, None)
        ]
        if process_st_files:
            st_files = [file for file in process_st_files]
            for file in st_files:
                upload_file(file, erpsku_info)
        else:
            time.sleep(30)
            print('No st report updates; sleeping 60 seconds.')
        old_files_modify_time = copy.deepcopy(new_files_modify_time)
        time.sleep(30)
        if datetime.now().hour in set(range(0, 7)):
            # Sync the shared-keyword table and the same-language
            # shared-keyword table from the erpsku search-term database into
            # the new database, splitting the search terms into several
            # columns to speed up user searches; an update time is added too.
            erpsku_restkws_sql = "SELECT * FROM erpsku_restkws_add_columns"
            erpsku_restkws_ori_data = conn_db.read_table(
                erpsku_restkws_sql, db='server_camp_report')
            erpsku_restkws_same_langs_sql = "SELECT * FROM erpsku_restkws_add_columns_filter_langs"
            erpsku_restkws_same_langs_data = conn_db.read_table(
                erpsku_restkws_same_langs_sql, db='server_camp_report')

            def detect_list_lang(words: list):
                """
                Detect each word's language:
                if a word starts and ends with an English letter or digit, it
                is treated as English; otherwise the closest language among
                ['it', 'en', 'de', 'fr', 'es', 'ja', 'zh'] is detected.
                Args:
                    words: list of words to detect
                Returns: list of detection results
                """
                list_lang = [
                    'en' if len(word) < 2 else 'en' if
                    (word[0] in string.printable) and
                    (word[-1] in string.printable) else
                    public_function.detect_lang(word) for word in words
                ]
                lang_dict = {
                    'en': 'english',
                    'ja': 'japanese',
                    'zh': 'chinese',
                    'it': 'italian',
                    'de': 'german',
                    'fr': 'french',
                    'es': 'spanish'
                }
                return [lang_dict[lang] for lang in list_lang]

            def add_columns(df):
                rest_kw_langs = detect_list_lang(list(df['rest_kw']))
                df['rest_kws_list'] = list(
                    map(public_function.split_sentence, list(df['rest_kw']),
                        rest_kw_langs))
                for i in range(10):
                    df[f'keyword_{i + 1}'] = [
                        word_list[i] if len(word_list) > i else ''
                        for word_list in df['rest_kws_list'].values
                    ]
                now_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                df['updatetime'] = now_datetime
                del df['rest_kws_list']

            def to_sql_replace(df, table, db='team_station'):
                # Execute the sql
                engine = create_engine(
                    "mysql+pymysql://{}:{}@{}:{}/{}?charset={}".format(
                        'user', '', 'wuhan.yibai-it.com', 33061, db, 'utf8'))
                conn = engine.connect()
                try:
                    # Write the dataframe to the database
                    index_columns = [
                        'account_site', 'seller_sku', 'erp_sku', 'asin',
                        'keyword_1', 'keyword_2', 'keyword_3', 'keyword_4',
                        'keyword_5', 'keyword_6', 'keyword_7', 'keyword_8',
                        'keyword_9', 'keyword_10'
                    ]
                    columns_type = VARCHAR(length=255)
                    index_column_type = {
                        column: columns_type for column in index_columns
                    }
                    df.to_sql(table,
                              conn,
                              if_exists='replace',
                              index=False,
                              dtype=index_column_type)
                    # Build the indexes
                    create_index_sql = (
                        "ALTER TABLE `%s` ADD INDEX 站点 (`%s`),"
                        "ADD INDEX seller_sku (`%s`),ADD INDEX erp_sku (`%s`),"
                        "ADD INDEX asin (`%s`),ADD INDEX keyword_1 (`%s`),"
                        "ADD INDEX keyword_2 (`%s`),ADD INDEX keyword_3 (`%s`),"
                        "ADD INDEX keyword_4 (`%s`),ADD INDEX keyword_5 (`%s`),"
                        "ADD INDEX keyword_6 (`%s`),ADD INDEX keyword_7 (`%s`),"
                        "ADD INDEX keyword_8 (`%s`),ADD INDEX keyword_9 (`%s`),"
                        "ADD INDEX keyword_10 (`%s`);" %
                        (table, index_columns[0], index_columns[1],
                         index_columns[2], index_columns[3], index_columns[4],
                         index_columns[5], index_columns[6], index_columns[7],
                         index_columns[8], index_columns[9],
                         index_columns[10], index_columns[11],
                         index_columns[12], index_columns[13]))
                    engine.execute(create_index_sql)
                except Exception as e:
                    print(e)
                finally:
                    conn.close()
                    engine.dispose()

            add_columns(erpsku_restkws_ori_data)
            add_columns(erpsku_restkws_same_langs_data)
            erpsku_restkws_for_search_table = "erpsku_restkws_add_columns_for_search"
            to_sql_replace(erpsku_restkws_ori_data,
                           erpsku_restkws_for_search_table,
                           db='server_camp_report')
            del erpsku_restkws_ori_data
            gc.collect()
            erpsku_restkws_same_langs_for_search_table = "erpsku_restkws_add_columns_filter_langs_for_search"
            to_sql_replace(erpsku_restkws_same_langs_data,
                           erpsku_restkws_same_langs_for_search_table,
                           db='server_camp_report')
            del erpsku_restkws_same_langs_data
            gc.collect()
            restart_hour = 9
            reset_time = (restart_hour - datetime.now().hour) * 3600
            time.sleep(reset_time)
            print(f'Starting again at {restart_hour} in the morning.')
            # Refresh erpsku_info
            conn_redis = public_function.Redis_Store(decode_responses=True,
                                                     db=0)
            redis_db0_keys = conn_redis.keys()
            erpsku_today_key = [
                key for key in redis_db0_keys
                if (erpsku_redis_key_sign in key) and (now_date in key)
            ]
            if not erpsku_today_key:
                # Refresh the erpsku keys in redis
                [
                    conn_redis.delete(key) for key in redis_db0_keys
                    if erpsku_redis_key_sign in key
                ]
                trans_erpsku_info_from_sql_2_redis()
                # Re-read the keys after the refresh
                redis_db0_keys = conn_redis.keys()
            erpsku_exist_key = [
                key for key in redis_db0_keys
                if erpsku_redis_key_sign in key
            ][0]
            conn_redis.close()
            conn_redis = public_function.Redis_Store(decode_responses=False,
                                                     db=0)
            erpsku_info = conn_redis.redis_download_df(erpsku_exist_key)
            conn_redis.close()
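
# Hedged sketch of the keyword-column expansion performed by add_columns():
# each split keyword list is padded (or truncated) into ten fixed columns so
# MySQL can index every position. The data below is illustrative.
def _demo_keyword_columns():
    df = pd.DataFrame(
        {'rest_kws_list': [['red', 'mug'], ['steel', 'water', 'bottle']]})
    for i in range(10):
        df[f'keyword_{i + 1}'] = [
            words[i] if len(words) > i else ''
            for words in df['rest_kws_list']
        ]
    return df
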