コード例 #1
0
    def _check_moves(board, pos):
        """Return True when ``pos`` is an in-range cell of ``board`` holding 0.

        ``pos`` is read as ``(pos[0], pos[1])`` and both coordinates must lie
        in ``[0, len(board))``.  The cell content is obtained through
        ``get_value(board, pos)``; any value other than 0 makes the move
        invalid.
        """
        # NOTE(review): both axes are bounded by len(board), exactly as in the
        # previous version, which presumes a square board -- confirm against
        # the callers before relying on this for rectangular boards.
        size = len(board)

        # Validate the coordinates *before* touching the cell.  The previous
        # version called get_value() first, so an out-of-range position was
        # handed to the accessor before the bounds check could reject it.
        if not (0 <= pos[0] < size and 0 <= pos[1] < size):
            return False

        return get_value(board, pos) == 0
コード例 #2
0
ファイル: reducer.py プロジェクト: james-fu/wangqiwen
def main(input=sys.stdin, output=sys.stdout):
    """Attach click (AOS) records to the search (SP) records of one session.

    Each line of ``input`` is split on tabs and must contain exactly the
    fields named in ``format_list``; lines of any other length are dropped
    silently.  Consecutive lines that share the same ``uid`` + ``sessionid``
    key are buffered in ``session_list``:

    * ``SP`` lines are appended as search records;
    * ``AOS`` lines are matched, last-appended record first, against the
      ``poi_ids`` of the buffered searches and recorded as clicks.

    When the key changes, every buffered record is printed via
    ``parse(item)`` and the buffer is restarted.

    NOTE(review): presumably the input arrives grouped by key (the code only
    compares each line with the previous key) -- confirm with the job setup.
    NOTE(review): ``output`` is never used in this body (results go through
    the ``print`` statement), and ``out_dict``, ``out_list`` and
    ``pattern_uid`` are assigned but not read here.
    NOTE(review): nothing in this body prints the records that are still
    buffered when ``input`` is exhausted -- confirm the last session is not
    lost.
    """
    format_list = [
        'uid', 'sessionid', 'stepid', 'time', 'position', 'source', 'action',
        'request', 'response', 'cellphone', 'other', 'dt'
    ]
    out_dict = {
        'uid': '-',
        'sessionid': '-',
        'stepid': '-',
        'time': '-',
        'position': '-',
        'query': '-',
        'result': '-',
        'click': '-',
        'cellphone': '-'
    }
    out_list = [
        'uid', 'sessionid', 'stepid', 'time', 'position', 'query', 'result',
        'click', 'cellphone'
    ]
    pattern_uid = re.compile(r'^[\w-]+$', re.I)
    # Key (uid + '\t' + sessionid) of the session currently being buffered.
    last_key = '-'
    # Buffered SP records of the current session; see the append() calls
    # below for the positional layout (index 2 = stepid, 5 = result_dict,
    # 6 = click info).
    session_list = []

    for line in input:
        # in: [uid sessionid stepid time position source action request response cellphone other]
        query_dict = {}
        result_dict = {}
        click_dict = {}

        arr = [i.strip() for i in line.strip().split('\t')]
        if len(arr) != len(format_list):
            #print >>sys.stderr,'line length error ! %s!=%s \nline=(%s)'%(len(arr),len(format_list),line)
            continue
        line_dict = dict(zip(format_list, arr))
        # These five fields are JSON-encoded; json.loads raises on malformed
        # input and nothing here catches it.
        for i in ('position', 'request', 'response', 'cellphone', 'other'):
            line_dict[i] = json.loads(line_dict[i])
        uid = line_dict['uid']
        sessionid = line_dict['sessionid']
        # Get stepid; -1 marks a value that could not be converted to int.
        try:
            stepid = int(line_dict['stepid'])
        except Exception, err:
            stepid = -1
        # Get time
        time = line_dict['time']
        source = line_dict['source']
        # Get query_type
        query_dict['query_type'] = line_dict['action']
        # Get citycode
        #citycode = line_dict['position']['citycode'] if 'citycode' in line_dict['position'] else '-'
        if source == 'SP':
            # Extract: query (keywords, category), count, poi_ids
            # Get the search query (keywords, category)
            for i in ('keywords', 'category'):  # keywords and category may both be present
                if i in line_dict['request']:
                    query_dict[i] = func.get_value(line_dict['request'], i)
            # query_dict already holds 'query_type', so fewer than 2 entries
            # means neither keywords nor category was found.
            if len(query_dict) < 2:
                # Skip this record when both keywords and category are missing
                continue
            # Get the search result list
            for i in ('count', 'poi_ids'):
                result_dict[i] = func.get_value(
                    line_dict['response'],
                    i) if i in line_dict['response'] else '-'
            # 2014-6-13 bug fix: with no results the click position came out as 1,
            # because poi_ids_list was ['-'] and a poiid of '-' matched it by accident
            if result_dict['poi_ids'] == '-':
                poi_ids_list = []
            else:
                poi_ids_list = result_dict['poi_ids'].split('&')
            result_dict['poi_ids_list'] = poi_ids_list
            # Redundant info carried along with the record
            request_str = func.dict2str(line_dict['request'])
            response_str = func.dict2str(line_dict['response'])
            position_str = func.dict2str(line_dict['position'])
            cellphone_str = func.dict2str(line_dict['cellphone'])

        elif source == 'AOS':
            # Extract: poiid
            if not 'poiid' in line_dict['request']:  # skip when poiid is missing
                continue
            poiid = func.get_value(line_dict['request'], 'poiid')
            click_dict['poiid'] = poiid
            # Redundant info, added alongside the request
            # NOTE(review): click_dict is filled here but not read again in
            # this body.
            click_dict['time'] = time
            click_dict['stepid'] = stepid

        else:  # unknown source type, filter it out
            continue
            pass  # unreachable after the continue above
        # NOTE(review): uid_pass is reset on every line, so it only affects
        # the line on which the threshold below is crossed.
        uid_pass = 0
        len_sp = len(session_list)
        # Filter abnormal users with more than 10,000 buffered records, to avoid data skew
        # NOTE(review): the buffered records are discarded here without being
        # printed.
        if len_sp > 10000:
            session_list = []
            uid_pass = 1
        # Output: [uid,sessionid,stepid,time,query,result,click,request,response,position,cellphone]
        # uid + sessionid is the key
        cur_key = uid + '\t' + sessionid
        if cur_key == last_key:
            if line_dict['source'] == 'SP':
                if uid_pass:
                    continue
                # Same key: append the record to the query list
                session_list.append([
                    uid, sessionid, stepid, time, query_dict, result_dict, {
                        'num': 0
                    }, request_str, response_str, position_str, cellphone_str
                ])
            elif line_dict['source'] == 'AOS':
                # Sort by stepid, time descending
                #session_list.sort(key=lambda x:(x[2],x[3]),reverse=True)
                # Look the poiid up in the buffered searches: stepid == -1 searches the
                # whole list; otherwise only searches with a smaller stepid are considered
                #for i,item in enumerate(session_list[::-1]): # 2014-6-4 bug fix: clicked poi not found in the displayed list, 27% of cases (250k/930k)
                # NOTE(review): len_sp was captured before the reset above; if
                # session_list was just emptied, session_list[i] below raises
                # IndexError.
                for j in xrange(len_sp):  # walk session_list from its last element backwards
                    i = -(j + 1)
                    item = session_list[i]
                    if stepid != -1 and stepid <= item[2]:
                        # With a valid stepid the search range is restricted
                        continue
                    tmp_poi_list = item[5]['poi_ids_list']
                    if poiid in tmp_poi_list:
                        # 1-based position of the clicked poi in the result list
                        pos = tmp_poi_list.index(poiid) + 1
                        # Handle multiple / repeated clicks, 2014-6-18
                        order = session_list[i][6]['num'] + 1
                        session_list[i][6]['num'] = order
                        # Record click info (including repeated clicks): the fields
                        # order, stepid, time, poiid, pos are joined with '|', and
                        # successive clicks are joined with '&'
                        if order <= 1:
                            # First click
                            session_list[i][6]['click_list'] = '|'.join([
                                str(order),
                                str(stepid), time, poiid,
                                str(pos)
                            ])
                        else:
                            # Subsequent click
                            session_list[i][6]['click_list'] += '&' + '|'.join(
                                [
                                    str(order),
                                    str(stepid), time, poiid,
                                    str(pos)
                                ])
                        #session_list[i][6]['poiid'] = poiid
                        #session_list[i][6]['pos'] = str(pos)
                        break  # stop after the first match: a click matches at most one search
        else:  # key changed: flush the current query list and start over
            #session_list.sort(key=lambda x:(x[2],x[3]),reverse=True)
            for item in session_list:
                print parse(item)
                # NOTE(review): uid_pass is not read again before the next
                # line resets it, so this assignment has no visible effect.
                uid_pass = 0
            session_list = []  # 2014-6-23 bug fix: the reset used to happen only for source=sp
            if source == 'SP':
                session_list.append([
                    uid, sessionid, stepid, time, query_dict, result_dict, {
                        'num': 0
                    }, request_str, response_str, position_str, cellphone_str
                ])
                # last_key only advances when a new session starts with an
                # SP record.
                last_key = cur_key
            elif source == 'AOS':
                # AOS has a click, but there is no SP record for it
                print >> sys.stderr, 'only in AOS (%s)' % (cur_key)
                pass
            else:
                # NOTE(review): sources other than SP/AOS were already skipped
                # earlier in the loop, so this branch looks unreachable.
                continue
                pass
コード例 #3
0
ファイル: mapper.py プロジェクト: james-fu/wangqiwen
def main( input = sys.stdin , output = sys.stdout ):
    '''
    Normalise SP (search) log lines into the shared record fields.

    Each input line is split on tabs into the four fields of ``format_list``
    (``tm``, ``request``, ``response``, ``dt``); ``request`` and ``response``
    are JSON-decoded.  The visible part of the loop fills ``out_dict`` (uid,
    sessionid, stepid, time, action), collects position / other values and
    derives ``citycode`` from the request's ``city`` value.

    NOTE(review): ``out_dict`` is created once and reused for every line, so
    a field that a later line does not set (e.g. ``time`` when ``tm`` is too
    short) keeps the value from an earlier line.
    NOTE(review): the visible portion of this function stops after the
    ``citycode`` step without writing any output; presumably the rest of the
    loop body lies outside this excerpt.

    Sample log content (the ``keywords`` value is mis-encoded in the sample):

    235702000010    addr_poi_merge:true+aos_version:2.12+app:sp_app+auto_cluster:false+cifa:800270049e7fd106ae72d101000000cc010300000000000000000000000000000000000000000000000000070000000500372e302e3409006950686f6e65362c3105004150504c450000000000000000000000000500+citysuggestion:true+data_type:poi+dic:c3320+dip:10920+diu:ac9b6b64-959e-4046-8b4b-615198ba06c1+diu2:63a8c5d9-93f2-4f6b-bad4-292e40e665a8+diu3:3908e66368f8407d6db29e833fd06abf82486d2b+div:iosh060400+expand_range:false+group_by_category:false+group_by_click:true+group_by_find_good_around_bus_station:true+group_by_name_component_result:true+group_by_parent:true+group_by_pos_standrand_order:true+group_by_whole_match:true+group_by_xy:true+group_by_xy_and_field:true+keywords:�ư�+location:true+name_replace:true+need_expand_range:true+noseg:parent;pguid;nid;brand_id;brand+page:1+page_num:10+qii:true+qii_server_port:14001+query_busline:true+query_channel:true+query_road:true+query_scene:category+query_src:amap6+query_type:rqbxy+range:5000.0+route_plan:true+search_operate:2+server_port:13333+session:104715881+show_fields:all+sort_filter:true+stepid:151+use_log:true+user_info:ac9b6b64-959e-4046-8b4b-615198ba06c1+user_loc:114.398544,30.501300+x:114.397759+y:30.500967+queryid=22263ea2-1b23-4617-bbdf-54b14df85c30 from:10.25.71.209+poi_ids:B001B0J0GS&B001B1ITO9&B0FFF0EGIZ&B0FFF0DQSJ&B001B16VTB&B0FFF0DQSI&B0FFF0EGWP&B0FFF0EWB2&B001B18O79&B001B1GZOU+qii_querytype:5+count:67+searchtime:81+totaltime:156
    '''
    # out ---> [uid (user id, string) time (string) position (map) source (data source, string) action (action type, string) request (map) response (map) other (map)]
    format_list = ['tm','request','response','dt']
    cellphone_list = ['div','model','device','manufacture']
    out_dict = {'uid':'-','sessionid':'-','stepid':'-','time':'-','position':'-','source':'SP','action':'-','request':'-','response':'-','cellphone':'-','other':'-'}
    out_list = ['uid','sessionid','stepid','time','position','source','action','request','response','cellphone','other']
    pattern_uid = re.compile(r'^[\w-]+$',re.I)
    #illegal_uid = ('NULL','unknown','0','aos') # 2014.5.22 invalid uid
    #illegal_uid = ('NULL','unknown','353021051343571','0','000000000000000','111111111111111','aos') # 2014.5.22 invalid uid
    illegal_uid = ('NULL','unknown','aos')
    # Load the adcode mapping dictionaries (indexed below by 'adcode' and 'adname')
    city_dict = func.adcode2citycode()
    #print city_dict['adname'].keys()
    #print '|'.join(city_dict['adname'].keys())
    # 2014-6-17 load the general-demand dictionary
    general_dict = func.loadGeneralDict()
    #print '|'.join(general_dict.keys())
    #print format_list
    for line in input:
        position_dict = {}
        request_dict = {}
        response_dict = {}
        cellphone_dict = {}
        other_dict = {}
        # The string literal below is a no-op expression holding disabled
        # legacy code (gbk -> utf8 conversion and a space-separated fallback).
        '''
        # 2014.5.22 原始日志utf8编码,停止转换,否则造成部分数据乱码
        # [2014-5-8] 5月8日以后的日志才是flume utf-8编码,之前是ftp方式直接上传原始日志(gbk)
        # 编码转换: gbk -> utf8
        try:
            line = line.decode('gbk').encode('utf8')
        except Exception,err:
            pass
        # 2014-5-23
        if line.find('\t') != -1:
            arr = [ i.strip() for i in line.strip().split('\t') ]
        else: # 兼容2014-5-8转码后异常数据,空格分隔
            arr = [ i.strip() for i in line.strip().split(' ') ]
        '''
        arr = [ i.strip() for i in line.strip().split('\t') ]

        if len(arr) != len(format_list):
            func.counter('Count','line length error',1)
            #print >>sys.stderr,'line length error ! %s!=%s \nline=(%s)'%(len(arr),len(format_list),line)
            continue
        line_dict = dict(zip(format_list,arr))
        # json.loads raises on malformed input; nothing here catches it.
        line_dict['request'] = json.loads(arr[1])
        line_dict['response'] = json.loads(arr[2])
        tm = line_dict['tm']
        # Reformat the first six characters as HH:MM:SS-style text.
        # NOTE(review): the guard requires MORE than six characters, so a
        # value of exactly six characters is not converted -- confirm this is
        # intended.
        if len(tm) > 6:
            out_dict['time'] = tm[0:2]+':'+tm[2:4]+':'+tm[4:6]
        request_dict = line_dict['request']
        response_dict = line_dict['response']
        if not request_dict or 'query_type' not in request_dict:
            func.counter('Count','query_type miss',1)
            continue
        # 2014-11-19 filter out crawler logs
        # NOTE(review): the next two code lines are indented with tabs while
        # the rest of the body uses spaces.
	if request_dict.get('user_info','-') == 'test' and request_dict.get('query_src','-') == 'test':
	    continue
        # uid comes from user_info, falling back to diu; both must match
        # pattern_uid.
        if 'user_info' in request_dict and pattern_uid.match(request_dict['user_info']):
            uid = func.get_value(request_dict,'user_info')
        elif 'diu' in request_dict and pattern_uid.match(request_dict['diu']):
            uid = func.get_value(request_dict,'diu')
        else: # 2014-6-17 skip the line when the user id is missing
            continue
        # 2014-6-17: drop indoor_slayer entries (noted as about 1/3 of all logs)
        if request_dict['query_type'] == 'indoor_slayer':
            continue
        # 2014.5.22
        if uid in illegal_uid:
            func.counter('Count','uid error',1)
            continue
        out_dict['uid'] = uid  # 2014-09-05 no longer forced to upper case
        # 2014-11-14 add sessionid ('session' is preferred over 'sessionid')
	if 'session' in request_dict: 
            out_dict['sessionid'] = func.get_value(request_dict,'session')
	elif 'sessionid' in request_dict:
	    out_dict['sessionid'] = func.get_value(request_dict,'sessionid')
	else:
	    out_dict['sessionid'] = '-'
        # 'step' is preferred over 'stepid'.
        if 'step' in request_dict:
            out_dict['stepid'] = func.get_value(request_dict,'step')
        elif 'stepid' in request_dict:
            out_dict['stepid'] = func.get_value(request_dict,'stepid')
        else:
            out_dict['stepid'] = '-'
        out_dict['action'] = func.get_value(request_dict,'query_type').upper()
        if 'data_type' in request_dict:
            request_dict['data_type'] = request_dict['data_type'].upper()
            other_dict['data_type'] = request_dict['data_type'].upper() # 2014-09-05 add data_type to other
        # Parse cifa  2014-5-5
        cifa_dict = {}
        if 'cifa' in request_dict:
            cifa_str = func.get_value(request_dict,'cifa')
            cifa_dict = cifa.parse_cifa(cifa_str) # decode cifa

        for i in ('diu','diu2','diu3'):
            other_dict[i] = func.get_value(request_dict,i)
        for i in ('x','y','user_loc','geoobj'):
            position_dict[i] = func.get_value(request_dict,i)
            # Fill user_loc from the lon/lat found in cifa (divided by 10**6)
            if i == 'user_loc' and position_dict[i] == '-' and 'lon' in cifa_dict and 'lat' in cifa_dict:
                position_dict[i] = str(float(cifa_dict['lon'])/10**6)+','+str(float(cifa_dict['lat'])/10**6)
        # RGEOCODE  2014-5-7
        # NOTE(review): geo_list is not used in the visible part of the body.
        geo_list = []
        citycode = '-'
        if request_dict.has_key('city'):
            # When the request carries its own city, convert it and assign it to citycode
            city = func.get_value(request_dict,'city').strip() # strip the trailing space after Chinese city names  2014-5-15
            # Convert according to the form of the value
            len_city = len(city)
            if len_city <= 1: # 2014-6-17 '==' -> '<=', to cover an empty city
                city = '-'
            elif city.isdigit():
                # 2-digit values, and 3-digit values without a leading '0',
                # get a '0' prepended.
                if len_city == 2 or ( len_city == 3 and not city.startswith('0') ):
                    city = '0' + city
                elif len_city == 6:
                    # Convert the 6-digit adcode to a city code
                    try:
                        city = city_dict['adcode'][city]
                    except Exception,err:
                        print >>sys.stderr,'adcode -> citycode error! (%s)'%(err)
                        pass
            else: # Chinese name -> city code; 2014-6-17: check whether the name exists before appending '市'
                if not city.endswith('市') and city not in city_dict['adname']:
                    city += '市'
                try:
                    city = city_dict['adname'][city]
                    #city = city_dict['adname'][city.decode('utf8').encode('gbk')]
                except Exception,err:
                    print >>sys.stderr,'adname -> citycode error! (%s,%s)'%(city,err)
                    pass
            # On a failed lookup the unconverted city value is kept as-is.
            citycode = city
コード例 #4
0
ファイル: mapper.py プロジェクト: james-fu/wangqiwen
            # user_loc:114.398544,30.501300
            user_loc_list = position_dict['user_loc'].split(',')
            if len(user_loc_list) == 2:
                try:
                    tmp_citycode = func.get_citycode(user_loc_list)
                except Exception,err:
                    print >>sys.stderr,'Regeo ERROR ! (%s)'%(repr(user_loc_list))
                    tmp_citycode = []
                if tmp_citycode:
                    user_loc_city = tmp_citycode[0]
        position_dict['user_loc_city'] = user_loc_city
        # 手机相关信息
        for i in cellphone_list:
            if i in cifa_dict:
                if i == 'div':  # [2014-5-21] div 大写
                    cellphone_dict[i] = func.get_value(cifa_dict,i).upper()
                else:
                    cellphone_dict[i] = func.get_value(cifa_dict,i)
            else:
                if i == 'div':
                    cellphone_dict[i] = func.get_value(request_dict,i).upper()
                else:
                    cellphone_dict[i] = func.get_value(request_dict,i)  #2014-09-05  cifa_dict->request_dict
        # 2014-6-17  新增字段is_general标记是否泛需求
        if 'keywords' in request_dict:
            if request_dict['keywords'] in general_dict:
                request_dict['is_general'] = '1'

        request_dict['cifa'] = func.dict2str(cifa_dict,';','=')
        for i in ('position','request','response','cellphone','other'):
            out_dict[i] = func.dict2str(eval("%s_dict"%(i)))
コード例 #5
0
ファイル: mapper.py プロジェクト: james-fu/wangqiwen
def main( input = sys.stdin , output = sys.stdout ):
    """Extract the shared record fields from Apache-style AOS access-log lines.

    Every input line is matched against ``pattern_apache``; its six capture
    groups are named by ``format_str``.  The query string is URL-unquoted and
    parsed into ``tmp_para``; lines without a usable ``diu`` are skipped.
    The visible part of the loop normalises ``shp_``-prefixed parameters,
    copies the parameters listed in ``para_dict`` into ``out_dict`` and
    derives ``x`` / ``y`` from the request or from ``cifa``.

    NOTE(review): ``out_list``, ``aos_dict`` and ``adcodeDict`` are prepared
    but not read in the visible portion, and no output is written here;
    presumably the rest of the loop body lies outside this excerpt.
    """
    # input: diu    date    time    path    para
    # output: uid   time    position  source action request response other; partition: dt
    # 359188049115769   2014-01-07  00:00:00    /ASS    t=traf
    out_list = ['uid','sessionid','stepid','time','position','source','action','request','response','cellphone','other']
    #format_str = ('date','time','method','path','para','code','size','os','resp_time')
    # 2014.6.13 keep only the useful fields
    format_str = ('date','time','path','para','os','resp_time')
    # 2014.6.13 regex changed to tolerate the partition field dt; fixes query-click counts being 0
    pattern_apache = re.compile(r"^.*?\s+.*?\s+.*?\[(.*?)\s+(.*?)\..*?\]\s+\".*?\s+(.*?)\?(.*?)\s+HTTP.*\"\s+.*?\s+.*?\s+\".*?\"\s+\"(.*?)\"\s+(.*?)\t.*?$",re.I)
    #pattern_apache = re.compile(r"^.*?\s+.*?\s+.*?\[(.*?)\s+(.*?)\..*?\]\s+\"(.*?)\s+(.*?)\?(.*?)\s+HTTP.*\"\s+(.*?)\s+(.*?)\s+\".*?\"\s+\"(.*?)\"\s+(.*?)$",re.I)
    pattern_uid = re.compile(r'^[\w-]+$',re.I)
    # 2014-5-16 cifa info moved to cellphone
    #para_dict = {'uid':'diu','sessionid':'session','stepid':'stepid','position':['user_loc','geoobj'],'cellphone':['div'],'other':['diu2','diu3','tid']} #2014-09-02 add tid
    # Output field -> request parameter name (string) or list of parameter
    # names to copy into a nested dict.
    para_dict = {'uid':'diu','sessionid':'session','stepid':'stepid','cellphone':['div'],'other':['diu2','diu3','tid']} #2014-09-02 add tid
    position_key = ('user_loc','geoobj')

    # Load the adcode -> citycode table
    adcodeDict = func.adcode2citycode()
    # Load the AOS url mapping table
    aos_dict = {}
    #aos_file = '../../../tool/aos.txt'
    aos_file = 'aos.txt'
    # NOTE(review): the file object is never closed explicitly.
    for line in file(aos_file):
        arr = [i.strip() for i in line.strip().split('\t')]
        # NOTE(review): this unpacking runs before the length check below, so
        # a line with fewer than two columns raises ValueError instead of
        # being skipped.
        key,value = arr[0:2]
        if len(arr) != 3:
            #20141112,garnett
            #print >>sys.stderr,'aos dict line error !(%s)'%(line)
            continue
        aos_dict[key.rstrip('/')] = value

    in_dict = {}
    for line in input:
        #line = line.replace("%0A","").replace("%0D","")
        p = pattern_apache.match(line)
        if not p:
            #20141112,garnett
            #func.counter('Count','line pattern miss',1)
            continue
        out_dict = {}
        in_dict = dict(zip(format_str,p.groups()))
        # 2014-09-05  del test data
        if in_dict['os'] == 'autonavi-ssl-scanner':
            continue
        #out_dict['time'] = in_dict['time']
        out_dict['time'] = func.get_value(in_dict,'time') # 2014-6-11
        para = urllib.unquote(func.get_value(in_dict,'para'))
        tmp_para = func.str2dict(para,'&','=')
        if 'diu' not in tmp_para or not pattern_uid.match(tmp_para['diu']) or tmp_para['diu'] == 'null' :
            # diu missed  [2014-3-17]
            #20141112,garnett
            #print >>sys.stderr,'diu missed ! (%s)'%(repr(tmp_para))
            #func.counter('Count','diu miss|match',1)
            continue
        # 2014-09-02 parse the searchhomepage parameters
        # 'shp_<name>' entries are renamed to '<name>'; when '<name>' already
        # exists, the stripped shp_ value is also stored under
        # '<name>newest' and overwrites '<name>'.  All other values are
        # stripped in place.
        # NOTE(review): the dict is modified while looping; this relies on
        # keys() returning a list snapshot, as it does in Python 2.
        for k in tmp_para.keys():
            if k.startswith('shp_'):
                if k[4:] not in tmp_para:
                    tmp_para[k[4:]] = tmp_para[k].strip()
                    del tmp_para[k]
                else:
                    tmpkey = k[4:] + "newest"
                    tmp_para[tmpkey] = tmp_para[k].strip()
                    tmp_para[k[4:]] = tmp_para[k].strip()
                    del tmp_para[k]

            else:
                tmp_para[k] = tmp_para[k].strip()
        for k in para_dict:
            v = para_dict[k]
            if type(v) == type('str'):
                if v not in tmp_para:
                    if v == 'stepid':
                        # 2014-7-21 step -> stepid
                        out_dict[k] = func.get_value(tmp_para,'step')
                    elif v=='session':
                        # 2014-09-02 fall back to the 'sessionid' parameter
                        out_dict[k] = func.get_value(tmp_para,'sessionid')
                    else:
                        out_dict[k] = '-'
                else: # 2014-5-16 take the value from the parsed parameters
                    out_dict[k] = func.get_value(tmp_para,v)
            elif type(v) == type([]):
                # List-valued entries become a nested dict holding only the
                # parameters that are actually present.
                out_dict[k] = {}
                for i in v:
                    if i in tmp_para:
                        out_dict[k][i] = func.get_value(tmp_para,i)
            else:
                #20141112,garnett 
                #print >>sys.stderr,'Illegal key(%s) found !'%(k)
                #func.counter('Count','illegal key',1)
                continue
        # Get the position: the first non-empty candidate parameter wins
        x, y = '-','-'
        for i  in ('x','lon','longitude'):
            if i in tmp_para and tmp_para[i] not in ('','-'):
                x = func.get_value(tmp_para,i)
                break
        for i  in ('y','lat','latitude'):
            if i in tmp_para and tmp_para[i] not in ('','-'):
                y = func.get_value(tmp_para,i)
                break
        # 2014-5-16: move the cifa info into other, consistent with SP
        # lon/lat from cifa go to position; manufacture, model etc. go to cellphone
        cifa_str = func.get_value(tmp_para,'cifa')
        cifa_dict = func.str2dict(cifa_str,';','=') # 2014-09-02 convert the cifa string into a dict
        # Fall back to the cifa lon/lat (divided by 10**6) when the request
        # gave no coordinate; conversion failures leave the value as '-'.
        if x == '-' and 'lon' in cifa_dict:
            lon = func.get_value(cifa_dict,'lon')
            try:
                x = str(float(lon)/10**6)
            except Exception,err:
                pass
        if y == '-' and 'lat' in cifa_dict:
            lat = func.get_value(cifa_dict,'lat')
            try:
                y = str(float(lat)/10**6)
            except Exception,err:
                pass
コード例 #6
0
ファイル: mapper.py プロジェクト: james-fu/wangqiwen
                y = str(float(lat)/10**6)
            except Exception,err:
                pass

        out_dict['position'] = {}
        out_dict['position']['x'] = x
        out_dict['position']['y'] = y
        # user_loc:114.398544,30.501300
        try:
            adcode = xy2ccode.xy2ccode(float(x),float(y))
        except Exception:
            adcode = "-"
        out_dict['position']['citycode'] = adcodeDict.get('adcode').get(adcode,adcode)

        for i in ('device','model','manufacture'):
            out_dict['cellphone'][i] = func.get_value(cifa_dict,i)
        out_dict['cellphone']['cifa'] = cifa_str #2014-09-02

        for k in position_key:
            if not tmp_para.has_key(k):
                #20141112,garnett
                #func.counter('Count','position key miss',1)
                continue
            if k == 'geoobj':
                out_dict['position'][k] = func.get_value(tmp_para,k).replace('|',';')

            else:
                out_dict['position'][k] = func.get_value(tmp_para,k)


        out_dict['source'] = "AOS" # source
コード例 #7
0
ファイル: old_mapper.py プロジェクト: james-fu/wangqiwen
 # 2014-6-19 去掉sug测试日志
 if 'query_src' in in_dict['req1'] and in_dict['req1'][
         'query_src'] == 'test':
     continue
 request_dict.update(in_dict['req2'])
 request_dict['tid'] = in_dict['tid']
 request_dict['aos_verion'] = in_dict['sysinfo'][
     'aos_version'] if 'aos_version' in in_dict['sysinfo'] else '-'
 response_dict['spend_time'] = in_dict['sysinfo'][
     'spend_time'] if 'spend_time' in in_dict['sysinfo'] else '-'
 #print response_dict
 # uid
 out_dict['uid'] = '-'
 for i in ('user_info', 'diu'):
     if i in request_dict and request_dict[i] != '-':
         out_dict['uid'] = func.get_value(
             request_dict, i).upper()  # 2014-6-21 sug的uid都转大写,同sp
         break
 if not pattern_uid.match(out_dict['uid']) and out_dict['uid'] != '-':
     func.counter('Count', 'uid illegal pass', 1)
     continue
 # sessionid,stepid
 for i in ('sessionid', 'stepid'):
     out_dict[i] = func.get_value(request_dict, i)
 # time
 if len(in_dict['time']) == 14:
     date = in_dict['time'][:8]
     time = in_dict['time'][8:10] + ':' + in_dict['time'][
         10:12] + ':' + in_dict['time'][12:]
 else:
     date, time = '-', '-'
 out_dict['time'] = time
コード例 #8
0
def main(input=sys.stdin, output=sys.stdout):
    """Normalise '||'-separated client log lines into the shared record fields.

    Each line is split on ``'||'`` into the fields named by ``format_str``;
    the last field carries a tab-separated ``dt`` suffix that is split off.
    The visible part of the loop validates ``diu`` and ``div``, converts the
    client timestamp, looks the page/button pair up in the code table loaded
    from ``page-button.json`` and parses the JSON ``para`` field.

    NOTE(review): ``out_dict`` is created once and reused for every line.
    NOTE(review): the visible portion stops inside the ``para`` parsing step
    without writing any output; presumably the rest of the loop body lies
    outside this excerpt.
    """
    # Illegal diu
    # Load the code table
    #pb_dict = json.loads(open('../../page-button/json.txt','r').read())
    # NOTE(review): the file object opened here is never closed explicitly.
    pb_dict = json.loads(open('page-button.json', 'r').read())
    pb_key_list = pb_dict.keys()
    # out ---> [uid (user id, string) time (string) position (map) source (data source, string) action (action type, string) request (map) response (map) other (map)]
    format_str = 'id||diu||div||aid||source||service||page||button||action||time||session||x||y||para||protocol_version||diu2||diu3||dic||model||device||manufacture||stepid'
    format_list = format_str.split('||')
    cellphone_list = ['div', 'model', 'device', 'manufacture']
    out_dict = {
        'uid': '-',
        'sessionid': '-',
        'stepid': '-',
        'time': '-',
        'position': '-',
        'source': 'CLIENT',
        'action': '-',
        'request': '-',
        'response': '-',
        'cellphone': '-',
        'other': '-'
    }
    out_list = [
        'uid', 'sessionid', 'stepid', 'time', 'position', 'source', 'action',
        'request', 'response', 'cellphone', 'other'
    ]
    #para_seg = 'download_rate,ip,start_time,method,url,version,result,start_receive_time,data_size,end_time' # ip and start_receive_time are not always present
    pattern_uid = re.compile(r'^[\w-]+$', re.I)
    #print format_list
    for line in input:
        position_dict = {}
        request_dict = {}
        response_dict = {}
        cellphone_dict = {}
        other_dict = {}
        arr = [i.strip() for i in line.strip().split('||')]
        # 2014-5-23 add dt: the last field is followed by tab-separated data;
        # dt is the text after the last tab, the field itself the text
        # before the first tab.
        dt = arr[-1].split('\t')[-1]
        arr[-1] = arr[-1].split('\t')[0]

        if len(arr) != len(format_list):
            print >> sys.stderr, 'line length error ! %s!=%s \n\t(%s)' % (
                len(arr), len(format_list), line)
            func.counter('Count', 'line length error', 1)
            continue
        # After the length check every name in format_list is a key of
        # line_dict.
        line_dict = dict(zip(format_list, arr))
        out_dict['uid'] = func.get_value(line_dict,
                                         'diu') if 'diu' in line_dict else '-'
        if not pattern_uid.match(out_dict['uid']):
            print >> sys.stderr, 'diu(%s) illegal ! pass ...' % (
                out_dict['uid'])
            func.counter('Count', 'uid miss match', 1)
            continue
        # NOTE(review): 'page' and 'button' are in format_list, so they are
        # always present in line_dict and this guard looks like it can never
        # fire.
        if 'page' not in line_dict or 'button' not in line_dict:
            func.counter('Count', 'page button miss', 1)
            print >> sys.stderr, 'miss page or button ! pass ...(%s)' % (line)
            continue
        page = func.get_value(line_dict, 'page')
        button = func.get_value(line_dict, 'button')
        out_dict['sessionid'] = func.get_value(
            line_dict, 'session') if 'session' in line_dict else '-'
        # NOTE(review): format_list has 'stepid' but no 'step', so the first
        # branch looks unreachable.
        if 'step' in line_dict:
            out_dict['stepid'] = func.get_value(line_dict, 'step')
        elif 'stepid' in line_dict:
            out_dict['stepid'] = func.get_value(line_dict, 'stepid')
        else:
            out_dict['stepid'] = '-'
        other_dict['diu2'] = func.get_value(line_dict, 'diu2')
        other_dict['diu3'] = func.get_value(line_dict, 'diu3')
        position_dict['x'] = func.get_value(line_dict, 'x')
        position_dict['y'] = func.get_value(line_dict, 'y')

        explain = '-'  # explanation of the action
        # NOTE(review): devi is not read in the visible part of the body.
        devi = '-'
        # Distinguish the OS by ver (div), e.g. IOSH060100, ANDH060000 -- [new]
        div = line_dict['div'].upper()  # OS type
        if len(div) < 10:
            print >> sys.stderr, 'div error ! div=(%s) pass' % (div)
            func.counter('Count', 'div error', 1)
            continue
        # Last five characters of div, compared as a string below.
        ver = div[-5:]
        # div identifies the map build per hardware platform: IOS(H,P), WIN(H,P), ANDH, BLBH
        if div.startswith('IOS'):
            os = 'ios'
        elif div.startswith('AND'):
            os = 'android'
        else:
            pass
        # NOTE(review): when div starts with neither 'IOS' nor 'AND', os is
        # not assigned on this line -- it is either unbound (NameError) or
        # still holds the value from an earlier line.
        line_dict['os'] = os
        explain = '-'
        out_dict['action'] = 'page=%s|button=%s' % (page, button)
        other_dict['action_name'] = '-'
        try:
            # Timestamps are parsed differently per app version. [2014-3-12] garnett
            # reported an 8h error in the client time parsing; after confirming with
            # Ruijuan the parsing was changed.
            # NOTE(review): 1293811200 looks like an epoch offset and 8 * 3600
            # like a UTC+8 shift -- confirm.  The comparison on ver is a
            # lexicographic string comparison.
            if ver < '60200':
                t_list = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.gmtime(
                        int(line_dict['time']) + 1293811200 +
                        8 * 3600)).split()
                # 2014-5-13 versions below 620 use the 620 code table directly
                new_div = div[0:4] + '060200'
            else:
                # Here the raw value is divided by 1000 before the offset is
                # added.
                t_list = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.gmtime(
                        int(line_dict['time']) / 1000 + 1293811200 +
                        8 * 3600)).split()
                # 2014-5-13 only data from 620 upwards uses its own code table (tables differ per version)
                if div in pb_key_list:
                    new_div = div
                else:
                    # Intermediate version: replace the last two characters with '00'
                    new_div = div[:-2] + '00'
        except Exception:
            print >> sys.stderr, 'time.strtime error ! (%s)' % (
                line_dict['time'])
            t_list = ['-', '-']
            # NOTE(review): new_div is not assigned on this path, so the
            # lookup below uses a value left over from an earlier line (or
            # fails and is reported as a lookup failure).

        try:
            explain = pb_dict[new_div][page][button]['explain']
        except Exception, err:
            print >> sys.stderr, '码表查找失败!div=%s,page=%s,button=%s' % (
                div, page, button)
        # Skip the actions whose explanation matches one of these literals.
        if explain in ('网页日志 ', '联网日志', '网络事件->网络事件'):
            #           android:1000,0  2000,0   IOS:2000,0
            func.counter('Count', 'pass action', 1)
            continue
        other_dict['action_name'] = explain
        out_dict['time'] = t_list[1]
        other_dict['date'] = t_list[0].replace(
            '-', '')  # [2014-3-17] 2014-03-17 --> 20140317
        if line_dict['para'] != '':
            try:
                #para_dict = json.loads(arr[13])
                para_dict = json.loads(func.get_value(line_dict, 'para'))
            except Exception, err:
                #print >>sys.stderr,'[error] json data:para=[%s]'%(arr[13])
                func.counter('Count', 'para miss', 1)
                continue
コード例 #9
0
        other_dict['date'] = t_list[0].replace(
            '-', '')  # [2014-3-17] 2014-03-17 --> 20140317
        if line_dict['para'] != '':
            try:
                #para_dict = json.loads(arr[13])
                para_dict = json.loads(func.get_value(line_dict, 'para'))
            except Exception, err:
                #print >>sys.stderr,'[error] json data:para=[%s]'%(arr[13])
                func.counter('Count', 'para miss', 1)
                continue
        else:
            #print >>sys.stderr,'para empty!(%s)'%(line)
            para_dict = {}
            pass
        # 手机相关信息
        for i in cellphone_list:
            cellphone_dict[i] = func.get_value(line_dict, i)
        # 请求信息
        request_dict = line_dict
        out_dict['position'] = func.dict2str(position_dict)
        out_dict['request'] = func.dict2str(request_dict)
        out_dict['response'] = func.dict2str(para_dict)
        out_dict['cellphone'] = func.dict2str(cellphone_dict)
        out_dict['other'] = func.dict2str(other_dict)
        print >> output, '\t'.join([out_dict[i] for i in out_list])
    #print json.dumps(tmp_dict,ensure_ascii=False,encoding='utf-8',indent=4)


# Script entry point: run main() with its default arguments only when this
# file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()