import datetime

import pymongo
import pyexcel

# Project-local helpers used below (database, datetime_timestamp, mkdatetime,
# format_datetime, timestamp_datetime, mcc_country, generic_query,
# insertTable, load_imei_org, load_mcc_country, utils) are assumed to be
# imported elsewhere in this module.


def choose_perf_collection(begin_datetime, end_datetime, prefix):
    begin_time = datetime_timestamp(begin_datetime)
    end_time = datetime_timestamp(end_datetime)
    ## 1. Dynamically resolve the sharded sub-collections to query.
    # Note that begin_time and end_time are deliberately swapped in the
    # condition: a sub-collection overlaps [begin, end) exactly when its
    # endTime >= begin_time and its beginTime < end_time.
    col_condition = {
        'type': prefix,
        'endTime': {'$gte': begin_time},
        'beginTime': {'$lt': end_time}
    }
    perf_mgo = database('PERF_MGO').get_db()
    sub_collections = list(
        perf_mgo.get_collection('t_db_sub_collections').find(col_condition))
    # Nothing found: sharding may have failed, so fall back to the newest
    # sub-collection for this prefix.
    if not sub_collections:
        sub_collections = list(
            perf_mgo.get_collection('t_db_sub_collections').find(
                {'type': prefix}, limit=1).sort('endTime',
                                                pymongo.DESCENDING))
    collections = []
    for item in sub_collections:
        collections.append(item['collectionName'])
    return collections
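# A minimal usage sketch (illustrative only; assumes t_db_sub_collections is
# populated). The overlap test above means a window can map to more than one
# physical collection if a shard boundary falls inside it; the printed name
# below is hypothetical.
def _demo_choose_perf_collection():
    cols = choose_perf_collection(mkdatetime('2017-06-01 00:00:00'),
                                  mkdatetime('2017-06-02 00:00:00'),
                                  prefix='t_term_vsim_estsucc')
    print(cols)  # e.g. ['t_term_vsim_estsucc_0'] (hypothetical name)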
def _query_flow_day_group(imsi_list, begin_datetime, end_datetime, con=None):
    flowdata = {}
    if not con:
        con = database('OSS_MGO').get_db()
    # Walk month by month so that a range spanning a month boundary hits
    # every monthly collection.
    while begin_datetime < end_datetime:
        begin_time = datetime_timestamp(begin_datetime)
        end_time = datetime_timestamp(end_datetime)
        pipeline = [{
            '$match': {
                'countTime': {'$gte': begin_time, '$lt': end_time},
                'imsi': {'$in': imsi_list}
            }
        }, {
            '$group': {
                '_id': '$imsi',
                'total': {'$sum': '$total'},
                'userTotal': {'$sum': '$userTotal'},
                'cardTotal': {'$sum': '$cardTotal'}
            }
        }, {
            '$project': {
                '_id': 1,
                'flow': {'$add': ['$userTotal', '$total', '$cardTotal']}
            }
        }]
        col = 't_terminal_flow_count_day_{}'.format(
            format_datetime(begin_datetime, '%Y%m01'))
        for doc in con[col].aggregate(pipeline, allowDiskUse=True):
            if doc['_id'] in flowdata:
                flowdata[doc['_id']] += doc['flow']
            else:
                flowdata[doc['_id']] = doc['flow']
        # Advance to the first day of the next month.
        if begin_datetime.month + 1 > 12:
            begin_datetime = datetime.datetime(year=begin_datetime.year + 1,
                                               month=1,
                                               day=1)
        else:
            begin_datetime = datetime.datetime(year=begin_datetime.year,
                                               month=begin_datetime.month + 1,
                                               day=1)
    return flowdata
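# A minimal sketch of a cross-month call (the IMSI is hypothetical; assumes
# the monthly collections exist). For this range the loop above aggregates
# t_terminal_flow_count_day_20170601 and then ..._20170701, summing each
# IMSI's flow across both months.
def _demo_query_flow_day_group():
    flows = _query_flow_day_group(['460001234567890'],
                                  mkdatetime('2017-06-15 00:00:00'),
                                  mkdatetime('2017-07-15 00:00:00'))
    for imsi, flow in flows.items():
        print(imsi, flow)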
def generate_timestamp_datetime():
    begin_datetime = mkdatetime('2017-01-01 08:00:00')
    end_datetime = mkdatetime('2018-01-01 08:00:00')
    tmp = [['timestamp', 'date']]
    while begin_datetime < end_datetime:
        ts = datetime_timestamp(begin_datetime)
        tmp.append([ts, begin_datetime])
        begin_datetime += datetime.timedelta(1)
    pyexcel.save_as(array=tmp, dest_file_name='2017_datetime_timestamp.csv')
def _query_flow_hour_group(imsi_list, begin_datetime, end_datetime, con=None):
    begin_time = datetime_timestamp(begin_datetime)
    end_time = datetime_timestamp(end_datetime)
    pipeline = [{
        '$match': {
            'countTime': {'$gte': begin_time, '$lt': end_time},
            'imsi': {'$in': imsi_list}
        }
    }, {
        '$group': {
            '_id': '$imsi',
            'total': {'$sum': '$total'},
            'userTotal': {'$sum': '$userTotal'},
            'cardTotal': {'$sum': '$cardTotal'}
        }
    }, {
        '$project': {
            '_id': 1,
            'flow': {'$add': ['$userTotal', '$total', '$cardTotal']}
        }
    }]
    if not con:
        con = database('OSS_MGO').get_db()
    # Unlike the day-level query above, this reads a single monthly
    # collection and does not walk across month boundaries.
    col = 't_terminal_flow_count_hour_{}'.format(
        format_datetime(begin_datetime, '%Y%m01'))
    flowdata = {}
    for doc in con[col].aggregate(pipeline, allowDiskUse=True):
        flowdata[doc['_id']] = doc['flow']
    return flowdata
def convertToUnixtime(df, datetime):
    # NOTE: the 'datetime' parameter is a column name and shadows the
    # datetime module inside this function; the module is not needed here,
    # so this works, but renaming the parameter would be cleaner.
    value = df[datetime]
    unixtime = utils.datetime_timestamp(value)
    return unixtime
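# A minimal row-wise usage sketch with pandas (the pandas usage and the
# 'date' column are assumptions, not part of this module): DataFrame.apply
# with axis=1 passes each row as 'df' and the column name via args.
def _demo_convert_to_unixtime():
    import pandas as pd
    frame = pd.DataFrame({'date': [mkdatetime('2017-01-01 08:00:00')]})
    frame['unixtime'] = frame.apply(convertToUnixtime, axis=1, args=('date',))
    print(frame)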
def fetch_day_flow(begin_datetime, end_datetime):
    begin_time = datetime_timestamp(begin_datetime)
    end_time = datetime_timestamp(end_datetime)
    pipeline = [{
        '$match': {
            'createtime': {'$gte': begin_time, '$lt': end_time}
        }
    }, {
        '$project': {
            '_id': 0,
            'createtime': 1,
            'mcc': 1,
            'lac': 1,
            'plmn': 1,
            'imei': 1,
            'imsi': 1,
            'userFlow': '$userFlower',
            'cardFlow': 1,
            'sysFlow': '$sysFlower'
        }
    }]
    mgo = database('GSVC_MGO').get_db()
    rdata = list(
        mgo.get_collection('dayChargeFlower').aggregate(pipeline,
                                                        allowDiskUse=True))
    keys = [
        'createtime', 'lac', 'mcc', 'plmn', 'imei', 'imsi', 'userFlow',
        'cardFlow', 'sysFlow'
    ]
    imei_org = load_imei_org()
    mcc_country = load_mcc_country()
    # Format the data and deal with missing fields.
    for i in range(len(rdata)):
        # Fill in missing keys.
        missing_key = list(set(keys).difference(set(rdata[i].keys())))
        for k in missing_key:
            if k in ['lac', 'imsi', 'plmn', 'mcc', 'imei']:
                rdata[i][k] = 'NaN'
            else:
                rdata[i][k] = 0
        # Attach IMEI info.
        if rdata[i]['imei'] != 'NaN' and rdata[i]['imei'] in imei_org:
            rdata[i]['t_orgid'] = imei_org[rdata[i]['imei']]['t_orgid']
            rdata[i]['t_type'] = imei_org[rdata[i]['imei']]['t_type']
        else:
            rdata[i]['t_orgid'] = 'NaN'
            rdata[i]['t_type'] = 'NaN'
        # Attach country info.
        if rdata[i]['mcc'] != 'NaN' and rdata[i]['mcc'] in mcc_country:
            # Handle GU and SP: MCC 310 covers the US together with Guam and
            # Saipan, so disambiguate by MNC and LAC.
            if rdata[i]['mcc'] == '310':
                if rdata[i]['plmn'] != 'NaN':
                    mnc = rdata[i]['plmn'][-3:]
                    if (mnc in ['470', '140']) and \
                            (rdata[i]['lac'] in ['208', '171', '1', '10',
                                                 '23', '24', '60', '66']):
                        rdata[i]['country'] = 'SPGU'
                    else:
                        rdata[i]['country'] = 'US'
                else:
                    rdata[i]['country'] = 'US'
            else:
                rdata[i]['country'] = mcc_country[rdata[i]['mcc']]
        else:
            rdata[i]['country'] = 'NaN'
    return rdata
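# A worked example of the GU/SP branch above (values hypothetical): for a
# record with mcc='310', plmn='310470', lac='23', plmn[-3:] gives mnc='470',
# which is in ['470', '140'], and the lac is in the listed LACs, so country
# becomes 'SPGU'; with lac='999' the same record would fall through to 'US'.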
def fetch_day_flow_cursor(begin_datetime, end_datetime):
    begin_time = datetime_timestamp(begin_datetime)
    end_time = datetime_timestamp(end_datetime)
    match = {'createtime': {'$gte': begin_time, '$lt': end_time}}
    project = {'_id': 0}
    mgo = database('GSVC_MGO').get_db()
    cur = mgo.get_collection('dayChargeFlower').find(match, project)
    keys = [
        'createtime', 'lac', 'mcc', 'plmn', 'imei', 'imsi', 'userFlower',
        'cardFlow', 'sysFlower'
    ]
    count = 0
    tmp = []  # batch of documents pending insertion
    imei_org = load_imei_org()
    mcc_country = load_mcc_country()
    while True:
        try:
            doc = cur.next()
        except StopIteration:
            # Flush the final partial batch.
            if tmp:
                insertTable(tmp, 't_terminal_flow_count_day_201701')
            break
        # Fill in missing keys.
        missing_key = list(set(keys).difference(set(doc.keys())))
        for k in missing_key:
            if k in ['lac', 'imsi', 'plmn', 'mcc', 'imei']:
                doc[k] = 'NaN'
            else:
                doc[k] = 0
        # Drop abnormal flow records. This is not enforced in the database
        # because adding the condition there makes querying and extraction
        # far slower.
        if doc['userFlower'] > 2147483648:
            continue
        # Attach IMEI info.
        if doc['imei'] != 'NaN' and doc['imei'] in imei_org:
            doc['t_orgid'] = imei_org[doc['imei']]['t_orgid']
            doc['t_type'] = imei_org[doc['imei']]['t_type']
        else:
            doc['t_orgid'] = 'NaN'
            doc['t_type'] = 'NaN'
        # Attach country info.
        if doc['mcc'] != 'NaN' and doc['mcc'] in mcc_country:
            # Handle GU and SP: MCC 310 covers the US together with Guam and
            # Saipan, so disambiguate by MNC and LAC.
            if doc['mcc'] == '310':
                if doc['plmn'] != 'NaN':
                    mnc = doc['plmn'][-3:]
                    if (mnc in ['470', '140']) and \
                            (doc['lac'] in ['208', '171', '1', '10', '23',
                                            '24', '60', '66']):
                        doc['country'] = 'SPGU'
                    else:
                        doc['country'] = 'US'
                else:
                    doc['country'] = 'US'
            else:
                doc['country'] = mcc_country[doc['mcc']]
        else:
            doc['country'] = 'NaN'
        tmp.append(doc)
        count += 1
        # Insert in batches of 2000 documents.
        if count % 2000 == 0:
            insertTable(tmp, 't_terminal_flow_count_day_201701')
            print('insert {}\n'.format(count))
            tmp = []
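# insertTable is defined elsewhere in this project. As a rough sketch of
# what the batched write likely amounts to (an assumption, not the actual
# implementation), each 2000-document batch maps to one bulk insert:
def _sketch_insert_table(docs, table):
    con = database('OSS_MGO').get_db()  # assumed target database
    if docs:
        con[table].insert_many(docs)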
def update_vsim_estsucc_last_succ():
    begin_datetime = generic_query(
        database('GSVC_SQL').get_db(),
        "SELECT MAX(update_time) FROM `t_term_vsim_estsucc_last_succ`")[0][0]
    begin_datetime = mkdatetime(str(begin_datetime))
    now = datetime.datetime.now()
    today = datetime.datetime(year=now.year, month=now.month, day=now.day)
    mgo = database('PERF_MGO').get_db()
    succ_info = {}
    # 1. Update succ info day by day to keep each query range small, and
    # collect the rows that will eventually be written to the table.
    while begin_datetime < today:
        begin_time = datetime_timestamp(begin_datetime)
        end_time = datetime_timestamp(begin_datetime +
                                      datetime.timedelta(days=1))
        # choose_perf_collection returns a list; a one-day range maps to a
        # single collection here.
        col = choose_perf_collection(
            begin_datetime,
            begin_datetime + datetime.timedelta(days=1),
            prefix='t_term_vsim_estsucc')[0]
        match = {'createTime': {'$gte': begin_time, '$lt': end_time}}
        # Gripe: renaming fields with Mongo's own $project makes the
        # pipeline painfully verbose.
        pipeline = [{
            '$match': match
        }, {
            '$sort': {'createTime': -1}
        }, {
            '$group': {
                '_id': '$vsimImsi',
                'succ_time': {'$first': '$succTime'},
                'succ_mcc': {'$first': '$mcc'},
                'succ_mnc': {'$first': '$mnc'},
                'succ_lac': {'$first': '$lac'},
            }
        }, {
            '$project': {
                '_id': 0,
                'imsi': '$_id',
                'succ_time': 1,
                'succ_mcc': 1,
                'succ_mnc': 1,
                'succ_lac': 1
            }
        }]
        for info in mgo[col].aggregate(pipeline, allowDiskUse=True):
            if info['imsi'] == "":
                continue
            info['update_time'] = today  # record the update date
            info['create_date'] = today  # effective on insert, not on update
            if len(str(info['succ_time'])) != 13:
                info['succ_time'] = mkdatetime('1900-01-01')
            else:
                info['succ_time'] = timestamp_datetime(info['succ_time'])
            info['succ_country'] = mcc_country(info['succ_mcc'],
                                               info['succ_mnc'],
                                               info['succ_lac'])
            # Later days overwrite earlier ones, keeping the latest success.
            succ_info[info['imsi']] = info
        begin_datetime += datetime.timedelta(days=1)
    # 2. Format the query result for insert/update.
    succ_list = list(succ_info.values())
    if not succ_list:
        return  # nothing new to write
    succ_tuple_key = succ_list[0].keys()
    succ_tuple_val = [tuple(v.values()) for v in succ_list]
    update_stmt = (
        "INSERT INTO `{target_table}` (`{colname}`) VALUES ({placeholder}) "
        "ON DUPLICATE KEY UPDATE `succ_time` = values(`succ_time`), "
        "`update_time` = values(`update_time`),"
        "`succ_lac` = values(`succ_lac`),"
        "`succ_mcc` = values(`succ_mcc`),"
        "`succ_country` = values(`succ_country`),"
        "`succ_mnc` = values(`succ_mnc`)").format(
            target_table='t_term_vsim_estsucc_last_succ',
            colname="`,`".join(succ_tuple_key),
            placeholder=','.join(['%s'] * len(succ_tuple_key)))
    con = database('GSVC_SQL_ADMIN').get_db()
    try:
        with con.cursor() as cur:
            effect_row = cur.executemany(update_stmt, succ_tuple_val)
        con.commit()
    except Exception:
        con.close()
        raise
    print('INSERT ROWS:{}'.format(effect_row))
    con.close()
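# The statement built above behaves as a bulk upsert: ON DUPLICATE KEY
# UPDATE rewrites the succ_* and update_time columns of an existing row
# (keyed on the table's unique index), while create_date only takes effect
# on first insert. Roughly (column order illustrative, following the dict
# insertion order of the aggregated rows):
#
#   INSERT INTO `t_term_vsim_estsucc_last_succ`
#       (`succ_time`,`succ_mcc`,`succ_mnc`,`succ_lac`,`imsi`, ...)
#   VALUES (%s,%s,%s,%s,%s, ...)
#   ON DUPLICATE KEY UPDATE `succ_time` = values(`succ_time`), ...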