def do_insert(flag_queue, task_queue, res_queue, db_info):
    """Worker loop: move queued todo records into the done DB.

    Pulls records from ``task_queue``, inserts each via ``_do_insert``,
    deletes the originating todo row, and reports per-record
    success/failure on ``res_queue``.

    :param flag_queue: unused; kept for interface compatibility
    :param task_queue: queue of todo records (record[0] is the row id)
    :param res_queue: queue receiving True/False per processed record
    :param db_info: dict with 'sqls', 'done-db' and 'todo-db' settings
    """
    logger.info('Starting: do_insert()')
    insert_done_sql = db_info['sqls']['insert-done']
    delete_todo_sql = db_info['sqls']['delete-todo']
    done_conn = gen_db_conn(db_info['done-db'])
    todo_conn = gen_db_conn(db_info['todo-db'])
    statsd_cfg = cfg['statsd']
    statsd_client = gen_statsd_client(statsd_cfg)
    STATSD_KEY = statsd_cfg['key-insert']
    while True:
        try:
            record = task_queue.get()
            _do_insert(done_conn, insert_done_sql, record)
            # Remove the todo row only after the insert path has run.
            execute_sql(todo_conn, delete_todo_sql, (record[0],), commit=True)
            res_queue.put(True)
            statsd_client.incr(STATSD_KEY)
        except KeyboardInterrupt:
            # Clean shutdown, mirroring loop_insert(); makes the
            # connection cleanup below reachable.
            break
        except Exception:
            res_queue.put(False)
            # Connections may be broken after a failure; rebuild both.
            done_conn.close()
            done_conn = gen_db_conn(db_info['done-db'])
            todo_conn.close()
            todo_conn = gen_db_conn(db_info['todo-db'])
            # Fixed: message previously blamed loop_insert().
            logger.error('do_insert Exception: %r' % traceback.format_exc())
            time.sleep(0.5)
    done_conn.close()
    todo_conn.close()
    logger.warning('@@@ Exit: do_insert()')
def insert(cls, initial=False):
    """
    Populate ``cls.table_name`` with freshly built data.

    :param initial: when True, truncate the table before inserting
    :return: None
    """
    try:
        logger.info(f"start update {cls.table_name}'s data.")
        if initial:
            truncate_sql = cls.splice_truncate_sql(cls.table_name)
            execute_sql(truncate_sql)
        df = cls.make_data()
        if df is None:
            return
        row_count = df.shape[0]
        logger.info(f"find {cls.table_name}'s {row_count} records")
        if row_count == 0:
            return
        insert_sql = cls.splice_insert_sql(cls.table_name, df)
        execute_sql(insert_sql)
    except OperationalError as e:
        logger.error(f"Due to exception: {e}, this task failed!")
    else:
        # Reached only when the insert above actually ran.
        logger.info(f'{cls.table_name} insert {df.shape[0]} records')
def process_msg(self, body, conn, sql):
    """Extract task fields from an already-parsed message body and
    persist one record, ignoring duplicate-key errors.

    :param body: dict-like message carrying a 'params' payload
    :param conn: DB connection passed through to execute_sql
    :param sql: parameterized INSERT statement
    """
    params = body.get('params')
    client_id = params.get('additional_info').get('client_id')
    thunder_hash = params.get('thunder_hash')
    # URL
    url = params.get('url')
    url_loc = url.get('location')
    url_hash = url.get('hash')
    # Seed
    seed_file = params.get('seed_file', {})
    seed_hash = seed_file.get('hash', '')
    swift_path = seed_file.get('path', '')
    algorithm = params.get('digest_algorithm')
    mime_type = params.get('mime_type')
    file_name = params.get('file_name')
    file_size = params.get('file_size')
    # Prefer the URL hash, falling back to the seed-file hash.
    # (The earlier dead read of params['digest'] was removed: it was
    # unconditionally overwritten here.)
    digest = url_hash if url_hash else seed_hash
    record = (cfg['custom-type'], client_id, thunder_hash, url_loc, digest,
              algorithm, mime_type, file_name, file_size, swift_path)
    try:
        execute_sql(conn, sql, record, commit=True)
    except mdb.IntegrityError:
        # Duplicate record: already stored, safe to skip.
        pass
def do_insert(flag_queue, task_queue, res_queue, db_info):
    """Worker loop: move queued todo records into the done DB.

    Pulls records from ``task_queue``, inserts each via ``_do_insert``,
    deletes the originating todo row, and reports per-record
    success/failure on ``res_queue``.

    :param flag_queue: unused; kept for interface compatibility
    :param task_queue: queue of todo records (record[0] is the row id)
    :param res_queue: queue receiving True/False per processed record
    :param db_info: dict with 'sqls', 'done-db' and 'todo-db' settings
    """
    logger.info('Starting: do_insert()')
    insert_done_sql = db_info['sqls']['insert-done']
    delete_todo_sql = db_info['sqls']['delete-todo']
    done_conn = gen_db_conn(db_info['done-db'])
    todo_conn = gen_db_conn(db_info['todo-db'])
    statsd_cfg = cfg['statsd']
    statsd_client = gen_statsd_client(statsd_cfg)
    STATSD_KEY = statsd_cfg['key-insert']
    while True:
        try:
            record = task_queue.get()
            _do_insert(done_conn, insert_done_sql, record)
            # Remove the todo row only after the insert path has run.
            execute_sql(todo_conn, delete_todo_sql, (record[0],), commit=True)
            res_queue.put(True)
            statsd_client.incr(STATSD_KEY)
        except KeyboardInterrupt:
            # Clean shutdown, mirroring loop_insert(); makes the
            # connection cleanup below reachable.
            break
        except Exception:
            res_queue.put(False)
            # Connections may be broken after a failure; rebuild both.
            done_conn.close()
            done_conn = gen_db_conn(db_info['done-db'])
            todo_conn.close()
            todo_conn = gen_db_conn(db_info['todo-db'])
            # Fixed: message previously blamed loop_insert().
            logger.error('do_insert Exception: %r' % traceback.format_exc())
            time.sleep(0.5)
    done_conn.close()
    todo_conn.close()
    logger.warning('@@@ Exit: do_insert()')
def _do_insert(done_conn, insert_done_sql, record):
    """Insert one record into the done DB, then register it in VDDB.

    Duplicate-key errors are silently ignored; any other failure is
    logged and swallowed. NOTE(review): because failures are swallowed
    here, the caller still reports success and deletes the todo row —
    confirm this is intended.

    :param done_conn: connection to the done DB
    :param insert_done_sql: parameterized INSERT statement
    :param record: row tuple; [0]=id, [3]=thunder_hash, [5]=digest
    """
    rid = record[0]
    thunder_hash = record[3]
    digest = record[5]
    try:
        logger.info('inserting mysql %d' % rid)
        execute_sql(done_conn, insert_done_sql, record[1:-1], commit=True)
        logger.info('inserting vddb %d' % rid)
        resp, logs = insert_vddb_tmp(cfg['vddb-async-url'],
                                     [thunder_hash, digest])
        if not resp or resp.status_code != 200:
            raise ValueError('Insert result management Failed! <%r>' % (logs,))
        logger.info('inserted mysql %d' % rid)
    except mdb.IntegrityError:
        pass
    except Exception:
        logger.error('_do_insert requests.Exception: %r' % traceback.format_exc())
        time.sleep(0.5)
def _id_df(cls):
    """
    Load common_id_attribution into a DataFrame, seeding the table first
    if it is (near-)empty.

    :return: DataFrame without id/timestamp columns; municipality names
        in ``full_city`` normalized to end with '市'
    """
    sql = "select * from common_id_attribution"
    data = execute_sql(sql, dict_cursor=True)
    if len(data) <= 1:
        logger.info(
            f'common_id_attribution table is empty, initial data of it.')
        CommonId.insert(initial=True)
        logger.info(f'common_id_attribution table initial data success.')
        # Fixed: re-read after seeding — the first query ran against the
        # empty table, so building the DataFrame from the stale `data`
        # would raise KeyError on the drop() below.
        data = execute_sql(sql, dict_cursor=True)
    df = pd.DataFrame(data)
    df.drop(['id', 'created_time', 'updated_time'], axis=1, inplace=True)

    def zhi_xia_city(name):
        # Normalize the four municipalities (北京/上海/天津/重庆) to
        # their two-character name plus '市'.
        if name is None:
            return name
        if name[:2] in ('北京', '上海', '天津', '重庆'):
            return name[:2] + '市'
        return name

    df['full_city'] = df['full_city'].apply(zhi_xia_city)
    return df
def _do_insert(done_conn, insert_done_sql, record):
    """Persist a single record to the done DB and then to VDDB.

    ``mdb.IntegrityError`` (duplicate key) is ignored; every other
    exception is logged and swallowed after a short pause.
    NOTE(review): swallowing here means the caller cannot distinguish
    failure from success — verify that is the intended contract.

    :param done_conn: done-DB connection
    :param insert_done_sql: parameterized INSERT statement
    :param record: row tuple; [0]=id, [3]=thunder_hash, [5]=digest
    """
    rid, thunder_hash, digest = record[0], record[3], record[5]
    try:
        logger.info('inserting mysql %d' % rid)
        execute_sql(done_conn, insert_done_sql, record[1:-1], commit=True)
        logger.info('inserting vddb %d' % rid)
        vddb_args = [thunder_hash, digest]
        resp, logs = insert_vddb_tmp(cfg['vddb-async-url'], vddb_args)
        ok = bool(resp) and resp.status_code == 200
        if not ok:
            raise ValueError('Insert result management Failed! <%r>' % (logs,))
        logger.info('inserted mysql %d' % rid)
    except mdb.IntegrityError:
        pass
    except Exception:
        logger.error('_do_insert requests.Exception: %r' % traceback.format_exc())
        time.sleep(0.5)
def _exist_numbers(cls):
    """
    Return the 7-digit number prefixes already stored in
    common_mobile_attribution, for de-duplicating new inserts.

    :return: set of existing ``number`` values (empty set when the
        table has no rows)
    """
    number_sql = "select number from common_mobile_attribution"
    number_data = execute_sql(number_sql, dict_cursor=True)
    if not number_data:
        # Fixed: an empty result made pd.DataFrame([])['number'] raise
        # KeyError; an empty table simply means nothing is known yet.
        return set()
    number_set = set(pd.DataFrame(number_data)['number'].to_list())
    return number_set
def _last_date(cls):
    """
    Fetch the most recent ``date`` stored in common_date.

    :return: the latest date, or None when the table is empty
    """
    sql = "select date from common_date order by date desc limit 1"
    rows = execute_sql(sql, dict_cursor=True)
    if not rows:
        return None
    return rows[0].get('date')
def _last_time(cls):
    """
    Return the calendar date of the newest row in
    common_mobile_attribution.

    :return: ``date`` of the latest created_time, or None when the table
        is empty or has no created_time values
    """
    last_date_sql = ("select max(created_time) as last_time "
                     "from common_mobile_attribution")
    rows = execute_sql(last_date_sql, dict_cursor=True)
    if not rows:
        return None
    last_time = rows[0].get('last_time')
    return None if last_time is None else last_time.date()
def _init_data(cls):
    """
    Pull historical common_date rows from my_user_profile.DateClass.

    :return: DataFrame with date_str, class, date and weekday columns
    """
    sql = "select distinct date as date_str, class from DateClass"
    rows = execute_sql(sql, db='my_user_profile', dict_cursor=True)
    df = pd.DataFrame(rows)
    df.drop_duplicates(subset=['date_str'], inplace=True)

    def to_date(s):
        # 'YYYYMMDD' string -> datetime.date
        return datetime.date(int(s[:4]), int(s[4:6]), int(s[6:]))

    df['date'] = df['date_str'].apply(to_date)
    df['weekday'] = df['date'].apply(lambda d: d.weekday())
    return df
def process_msg(self, body, db_conn, sql):
    """Parse a task message (dict or JSON string), optionally push its
    hashes to VDDB, and store one record (duplicates ignored).

    :param body: dict or JSON string carrying a 'params' payload
    :param db_conn: DB connection used by execute_sql
    :param sql: parameterized INSERT statement
    :raises ValueError: when the VDDB insert does not return HTTP 200
    """
    # NOTE(review): the `unicode` builtin implies this module targets
    # Python 2 — confirm before porting.
    if isinstance(body, (str, unicode)):
        body = json.loads(body)
    params = body.get('params')
    task_uuid = params.get('external_id')
    client_id = params.get('additional_info').get('client_id')
    thunder_hash = params.get('thunder_hash')
    digest = params.get('digest')
    # URL section
    url = params.get('url')
    url_loc = url.get('location')
    url_hash = url.get('hash')
    # Seed-file section (all optional)
    seed_file = params.get('seed_file', {})
    seed_hash = seed_file.get('hash', '')
    swift_path = seed_file.get('path', '')
    algorithm = params.get('digest_algorithm')
    mime_type = params.get('mime_type')
    file_name = params.get('file_name')
    file_size = params.get('file_size')
    if cfg['vddb-async']['should-insert']:
        logger.info('Insert to vddb: %s' % task_uuid)
        hashes = [thunder_hash, url_hash, seed_hash]
        resp, logs = insert_vddb_tmp(cfg['vddb-async']['url'], hashes)
        if not resp or resp.status_code != 200:
            raise ValueError('Insert result management Failed! <%r>' % (logs,))
    record = (cfg['custom-type'], client_id, thunder_hash, url_loc, digest,
              algorithm, mime_type, file_name, file_size, swift_path)
    try:
        execute_sql(db_conn, sql, record)
    except mdb.IntegrityError:
        # Already stored; safe to ignore.
        pass
def process_msg(self, body, db_conn, sql):
    """Decode a task message, forward its hashes to VDDB when enabled,
    then insert one row, tolerating duplicate keys.

    :param body: dict or JSON string carrying a 'params' payload
    :param db_conn: DB connection used by execute_sql
    :param sql: parameterized INSERT statement
    :raises ValueError: when the VDDB insert does not return HTTP 200
    """
    # NOTE(review): `unicode` is a Python 2 builtin — this module
    # presumably still runs on py2; confirm before porting.
    if isinstance(body, (str, unicode)):
        body = json.loads(body)
    params = body.get('params')
    task_uuid = params.get('external_id')
    client_id = params.get('additional_info').get('client_id')
    thunder_hash = params.get('thunder_hash')
    digest = params.get('digest')
    # URL fields
    url = params.get('url')
    url_loc = url.get('location')
    url_hash = url.get('hash')
    # Seed-file fields (optional)
    seed_file = params.get('seed_file', {})
    seed_hash = seed_file.get('hash', '')
    swift_path = seed_file.get('path', '')
    algorithm = params.get('digest_algorithm')
    mime_type = params.get('mime_type')
    file_name = params.get('file_name')
    file_size = params.get('file_size')
    if cfg['vddb-async']['should-insert']:
        logger.info('Insert to vddb: %s' % task_uuid)
        resp, logs = insert_vddb_tmp(cfg['vddb-async']['url'],
                                     [thunder_hash, url_hash, seed_hash])
        vddb_ok = bool(resp) and resp.status_code == 200
        if not vddb_ok:
            raise ValueError('Insert result management Failed! <%r>' % (logs,))
    record = (cfg['custom-type'], client_id, thunder_hash, url_loc, digest,
              algorithm, mime_type, file_name, file_size, swift_path)
    try:
        execute_sql(db_conn, sql, record)
    except mdb.IntegrityError:
        # Duplicate row; nothing to do.
        pass
def loop_insert(flag_queue, task_queue, res_queue, db_info):
    """Producer loop: batch-select todo rows and feed the insert workers.

    Each turn selects up to ``select-limit`` todo rows, queues them for
    the do_insert() workers, collects one result per queued row, and
    logs the per-turn success count. Sleeps ``select-interval`` when no
    rows were found.

    :param flag_queue: unused; kept for interface compatibility
    :param task_queue: queue fed with todo records
    :param res_queue: queue of True/False results from workers
    :param db_info: dict with 'sqls', 'todo-db', 'select-limit' and
        'select-interval' settings
    """
    logger.info('Starting: loop_insert() => %s' % (cfg['vddb-async-url']))
    select_todo_sql = db_info['sqls']['select-todo']
    select_limit = db_info['select-limit']
    todo_conn = gen_db_conn(db_info['todo-db'])
    while True:
        try:
            # Drain any stale results left over from a previous turn.
            while not res_queue.empty():
                res_queue.get()
            rows = execute_sql(todo_conn, select_todo_sql, (select_limit,),
                               fetch=True)
            for record in rows:
                logger.info('Deleted mysql %d' % record[0])
                task_queue.put(record)
            time.sleep(1)
            results = []
            results_ok = 0
            rows_len = len(rows)
            for _ in range(rows_len):
                try:
                    ok = res_queue.get(timeout=5)
                except Queue.Empty:
                    # Fixed: was `except Queue.Empty():`, which tries to
                    # catch an exception *instance* and raises TypeError
                    # instead of handling the timeout.
                    ok = False
                if ok:
                    results_ok += 1
                results.append(ok)
            if rows_len == 0:
                time.sleep(db_info['select-interval'])
            logger.info('One turn: (%d/%d), %r' % (results_ok, rows_len, results))
        except KeyboardInterrupt:
            break
        except Exception:
            # Connection may be broken; rebuild and retry after a pause.
            todo_conn.close()
            todo_conn = gen_db_conn(db_info['todo-db'])
            logger.error('loop_insert Exception: %r' % traceback.format_exc())
            time.sleep(0.5)
    todo_conn.close()
    logger.warning('@@@ Exit: loop_insert()')
def _update_user_mobile(cls):
    """
    Collect the last day's new user phone prefixes (first 7 digits)
    from my_v2.credit_user, drop invalid/known ones, and attach
    attribution info via ``cls._phone_query``.

    :return: DataFrame with number + attribution columns, or None when
        there is nothing new to add
    """
    end = datetime.datetime.now().date()
    start = end - datetime.timedelta(days=1)
    sql = f"select distinct mobile as number from credit_user where created_time between '{start}' and '{end}'"
    data = execute_sql(sql, db='my_v2', dict_cursor=True)
    if len(data) == 0:
        logger.info(
            f'Not add any new user between {start} and {end} in credit_user table'
        )
        return
    df = pd.DataFrame(data)
    # Keep valid numbers only, truncated to their 7-digit prefix.
    df = df[df['number'].apply(is_valid_phone)]
    df['number'] = df['number'].apply(lambda num: num[:7])
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True)
    # Discard prefixes the table already contains.
    known = cls._exist_numbers()
    df = df[df['number'].apply(lambda num: num not in known)]
    logger.info(
        f"New unique numbers: {df.values.tolist()} between {start} and {end}"
    )
    if df.shape[0] == 0:
        logger.info(f"No valid update data for {cls.table_name} table!")
        return
    # _phone_query yields '|'-separated fields in the order:
    # city_code|zip_code|short_province|short_city|phone_type
    df['phone_info'] = df['number'].apply(cls._phone_query)
    parts = df['phone_info'].str.split('|', expand=True)
    df['city_code'] = parts[0]
    df['zip_code'] = parts[1]
    df['short_province'] = parts[2]
    df['short_city'] = parts[3]
    df['phone_type'] = parts[4]
    df.drop(columns=['phone_info'], axis=1, inplace=True)
    return df
def _init_data(cls):
    """
    Load historical phone-attribution rows from
    my_user_profile.PhoneCity as a DataFrame.

    :return: DataFrame keyed by 7-digit ``number`` with attribution
        columns; empty strings normalized to NaN
    """
    sql = ("select distinct code, province, city, corporation, area_code, "
           "zip_code from PhoneCity")
    rows = execute_sql(sql, db='my_user_profile', dict_cursor=True)
    df = pd.DataFrame(rows)
    # Align source column names with our table schema (one pass instead
    # of two disjoint renames — behavior identical).
    df.rename(columns={
        'code': 'number',
        'corporation': 'phone_type',
        'area_code': 'city_code',
        'province': 'short_province',
        'city': 'short_city',
    }, inplace=True)
    df = df[df['number'].apply(is_valid_phone_7)]
    # Treat empty strings as missing values.
    df.replace('', np.nan, inplace=True)
    return df
def existed_id(cls):
    """Return every row of common_id_attribution (dict rows)."""
    query = "select * from common_id_attribution"
    return execute_sql(query, dict_cursor=True)