class MRPVDay(MRJob):
    """MapReduce job: page views (PV) per day, plus an overall total."""

    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit (date, 1) for the line's access date and ('total', 1) overall."""
        self.ng_line_parser.parse(line)
        day, _clock = str(self.ng_line_parser.access_time).split()
        yield day, 1      # per-day count
        yield 'total', 1  # overall count

    def reducer(self, key, values):
        """Sum every count emitted for one key."""
        yield key, sum(values)
class MRPVHour(MRJob):
    """MapReduce job: page views (PV) per hour of day, plus an overall total."""

    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit (hour, 1) for the line's access hour and ('total', 1) overall."""
        self.ng_line_parser.parse(line)
        _day, clock = str(self.ng_line_parser.access_time).split()
        hour, _minute, _second = clock.split(':')
        yield hour, 1     # per-hour count
        yield 'total', 1  # overall count

    def reducer(self, key, values):
        """Sum every count emitted for one key."""
        yield key, sum(values)
class MRUVCdnIpAddr(MRJob):
    """MapReduce job: UV count per CDN IP, then the top 100 IPs (by count,
    descending) annotated with the geographic address resolved from a
    tab-separated IP-range table.
    """

    OUTPUT_PROTOCOL = RawProtocol

    ng_line_parser = NgLineParser()

    # Tab-separated IP-range/address table loaded by the second step's
    # reducer_init (was a hard-coded literal buried inside the method).
    AREA_IP_PATH = '/root/script/nginx_log_parse/area_ip.csv'

    def mapper(self, _, line):
        """Emit (cdn_ip, 1) for every log line."""
        self.ng_line_parser.parse(line)
        yield self.ng_line_parser.cdn_ip, 1

    def reducer_sum(self, key, values):
        """Total the UV count for one IP; re-key everything under None so
        the ranking step sees all (count, ip) pairs together."""
        yield None, [str(sum(values)), key]

    def init_ip_addr_df(self):
        """Load the IP-range/address CSV into a DataFrame for lookups."""
        cols = ['id', 'ip_start_num', 'ip_end_num',
                'ip_start', 'ip_end', 'addr', 'operator']
        self.ip_addr_df = pd.read_csv(self.AREA_IP_PATH, sep='\t',
                                      names=cols, index_col='id')

    def _resolve_addr(self, ip):
        """Return the address whose numeric range contains *ip*, or None.

        Catches only the expected failure modes (invalid IP string, bad
        byte count, no matching range, missing column) instead of the
        original bare ``except:`` which hid real bugs.
        """
        try:
            # inet_aton gives network-order bytes; native unpack + ntohl
            # nets out to the big-endian integer value of the address.
            ip_num = socket.ntohl(
                struct.unpack("I", socket.inet_aton(str(ip)))[0])
            hits = self.ip_addr_df[
                (self.ip_addr_df.ip_start_num <= ip_num)
                & (ip_num <= self.ip_addr_df.ip_end_num)]
            return hits.at[hits.index.tolist()[0], 'addr']
        except (OSError, struct.error, IndexError, KeyError):
            return None

    def reducer_top100(self, _, values):
        """Emit the 100 largest (count, ip) pairs in descending count
        order, appending the resolved address when lookup succeeds."""
        for cnt, ip in heapq.nlargest(100, values, key=lambda x: int(x[0])):
            addr = self._resolve_addr(ip)
            if addr is None:
                yield cnt, ip  # best-effort: fall back to the bare IP
            else:
                yield cnt, '{ip} {addr}'.format(ip=ip, addr=addr)

    def steps(self):
        """Two steps: per-IP counting, then address-annotated ranking."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer_sum),
            MRStep(reducer_init=self.init_ip_addr_df,
                   reducer=self.reducer_top100),
        ]
class MRUrlReq(MRJob):
    """MapReduce job: top-100 requested URLs by hit count."""

    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit (request_url, 1) for every log line."""
        self.ng_line_parser.parse(line)
        yield self.ng_line_parser.request_url, 1

    def reducer_sum(self, key, values):
        """Total hits for one URL; re-key under None for the ranking step."""
        yield None, [sum(values), key]

    def reducer_top100(self, _, values):
        """Emit the 100 most requested URLs, descending by hit count."""
        for hits, url in heapq.nlargest(100, values):
            yield hits, url

    def steps(self):
        counting = MRStep(mapper=self.mapper, reducer=self.reducer_sum)
        ranking = MRStep(reducer=self.reducer_top100)
        return [counting, ranking]
class MRBrowser(MRJob):
    """MapReduce job: top-100 browsers by hit count."""

    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit (browser, 1) for every log line."""
        self.ng_line_parser.parse(line)
        yield self.ng_line_parser.browser, 1

    def reducer_sum(self, key, values):
        """Total hits for one browser; re-key under None for ranking."""
        yield None, [sum(values), key]

    def reducer_top100(self, _, values):
        """Emit the 100 most common browsers, descending by hit count."""
        for hits, agent in heapq.nlargest(100, values):
            yield hits, agent

    def steps(self):
        counting = MRStep(mapper=self.mapper, reducer=self.reducer_sum)
        ranking = MRStep(reducer=self.reducer_top100)
        return [counting, ranking]
class MRUrlRef(MRJob):
    """MapReduce job: referrer (external-link) hit counts, descending."""

    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit (reference_url, 1) for every log line."""
        self.ng_line_parser.parse(line)
        yield self.ng_line_parser.reference_url, 1

    def reducer_sum(self, key, values):
        """Total hits for one referrer; re-key under None for sorting."""
        yield None, [sum(values), key]

    def reducer_desc(self, key, values):
        """Emit every (count, referrer) pair in descending order."""
        for hits, ref in sorted(values, reverse=True):
            yield hits, ref

    def steps(self):
        counting = MRStep(mapper=self.mapper, reducer=self.reducer_sum)
        ranking = MRStep(reducer=self.reducer_desc)
        return [counting, ranking]
class PDNgLogStat(object):
    """Pandas-based nginx access-log statistics.

    Call load_data() before using the stat methods, and load_ip_addr()
    before the *_addr variants that resolve IPs to geographic addresses.
    """

    def __init__(self):
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Parse every line of each file in *pathes*, yielding one dict
        per log line (via NgLineParser.to_dict)."""
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def _ip2num(self, ip):
        """Convert a dotted-quad IPv4 string to its integer value.

        Returns -1 when *ip* is not a valid address (previously hidden
        behind a bare ``except:``; now only the expected errors are caught).
        """
        try:
            # inet_aton gives network-order bytes; native unpack + ntohl
            # nets out to the big-endian integer value of the address.
            return socket.ntohl(
                struct.unpack("I", socket.inet_aton(str(ip)))[0])
        except (OSError, struct.error):
            return -1

    def _get_addr_by_ip(self, ip):
        """Return the address whose numeric IP range contains *ip*.

        Returns None when the IP is invalid, no range matches, or the
        IP-address table has not been loaded yet.
        """
        ip_num = self._ip2num(ip)
        try:
            df = self.ip_addr_df
            hits = df[(df.ip_start_num <= ip_num)
                      & (ip_num <= df.ip_end_num)]
            return hits.at[hits.index.tolist()[0], 'addr']
        except (AttributeError, IndexError, KeyError):
            # table not loaded / no matching range / malformed table
            return None

    def load_data(self, path):
        """Build self.df from the iterable of log-file paths in *path*."""
        self.df = pd.DataFrame(self._log_line_iter(path))

    def _count_column(self, col):
        """Group self.df by *col* and return the per-value row counts
        (shared by the url/ip/browser stat methods below)."""
        grp = self.df[[col]].groupby(self.df[col])
        return grp.agg(['count'])[col]

    def pv_day(self):
        """PV per day, grouped on the yyyy-mm-dd part of access_time."""
        grp = self.df[['access_time']].groupby(
            self.df['access_time'].map(lambda x: x.split()[0]))
        return grp.agg(['count'])

    def pv_hour(self):
        """PV per hour of day, grouped on the hh part of access_time."""
        grp = self.df[['access_time']].groupby(
            self.df['access_time'].map(
                lambda x: x.split().pop().split(':')[0]))
        return grp.agg(['count'])

    def url_ref_stat(self):
        """Referrer (external-link) hit counts, descending."""
        return self._count_column('reference_url').sort_values(
            by='count', ascending=False)

    def url_req_stat(self):
        """Requested-page hit counts, descending."""
        return self._count_column('request_url').sort_values(
            by='count', ascending=False)

    def uv_cdn_ip(self, top=100):
        """Top *top* CDN IPs by hit count."""
        return self._count_column('cdn_ip').nlargest(top, 'count')

    def uv_real_ip(self, top=100):
        """Top *top* real (client) IPs by hit count."""
        return self._count_column('real_ip').nlargest(top, 'count')

    def uv_cdn_ip_addr(self, top=100):
        """Top CDN IPs with a resolved-address column appended."""
        cnt_df = self.uv_cdn_ip(top)
        cnt_df.insert(len(cnt_df.columns), 'addr',
                      cnt_df.index.map(self._get_addr_by_ip))
        return cnt_df

    def uv_real_ip_addr(self, top=100):
        """Top real IPs with a resolved-address column appended."""
        cnt_df = self.uv_real_ip(top)
        cnt_df.insert(len(cnt_df.columns), 'addr',
                      cnt_df.index.map(self._get_addr_by_ip))
        return cnt_df

    def browser_stat(self, top=100):
        """Top *top* browsers by hit count (default 100, as before)."""
        return self._count_column('browser').nlargest(top, 'count')

    def load_ip_addr(self, path):
        """Load the tab-separated IP-range/address table into a DataFrame
        and keep it on self.ip_addr_df for later lookups."""
        cols = ['id', 'ip_start_num', 'ip_end_num',
                'ip_start', 'ip_end', 'addr', 'operator']
        self.ip_addr_df = pd.read_csv(path, sep='\t', names=cols,
                                      index_col='id')
        return self.ip_addr_df
# NOTE(review): appears to duplicate PDNgLogStat.__init__ — presumably an
# orphaned fragment left over from an edit; confirm which class (if any)
# this belongs to before removing.
def __init__(self): self.ng_line_parser = NgLineParser()