Exemple #1
0
class MRPVDay(MRJob):
    """Count page views per day, plus one overall total."""

    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit a count of 1 for the line's access day and for ``'total'``."""
        parser = self.ng_line_parser
        parser.parse(line)
        # The stringified access time has to split into exactly two
        # whitespace-separated fields; only the first (the day) is used.
        day, _clock = str(parser.access_time).split()
        yield day, 1  # per-day counter
        yield 'total', 1  # overall counter

    def reducer(self, key, values):
        """Add up every count emitted for ``key``."""
        total = sum(values)
        yield key, total
Exemple #2
0
class MRPVHour(MRJob):
    """Count page views per hour of day, plus one overall total."""

    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit a count of 1 for the line's access hour and for ``'total'``."""
        parser = self.ng_line_parser
        parser.parse(line)
        # Stringified access time -> "<day> <clock>"; the clock part must
        # consist of exactly three ':'-separated fields.
        _day, clock = str(parser.access_time).split()
        hour, _minute, _second = clock.split(':')
        yield hour, 1  # per-hour counter
        yield 'total', 1  # overall counter

    def reducer(self, key, values):
        """Add up every count emitted for ``key``."""
        total = sum(values)
        yield key, total
Exemple #3
0
class MRUVCdnIpAddr(MRJob):
    """Count hits per CDN IP and output the 100 busiest IPs with their address.

    Step 1 counts how often each ``cdn_ip`` occurs; step 2 keeps the 100
    largest counts and appends the address found in the IP-range table.
    """

    OUTPUT_PROTOCOL = RawProtocol
    # Tab-separated IP-range table read by ``init_ip_addr_df``. Kept as a
    # class attribute so a subclass can point at a different file.
    AREA_IP_PATH = '/root/script/nginx_log_parse/area_ip.csv'
    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit ``(cdn_ip, 1)`` for the parsed log line."""
        self.ng_line_parser.parse(line)
        yield self.ng_line_parser.cdn_ip, 1

    def reducer_sum(self, key, values):
        """Total the hits of one IP and emit ``[count_as_str, ip]`` under ``None``."""
        yield None, [str(sum(values)), key]

    def init_ip_addr_df(self):
        """Read the IP-range table into ``self.ip_addr_df``, indexed by ``id``."""
        cols = [
            'id', 'ip_start_num', 'ip_end_num', 'ip_start', 'ip_end', 'addr',
            'operator'
        ]
        self.ip_addr_df = pd.read_csv(self.AREA_IP_PATH,
                                      sep='\t',
                                      names=cols,
                                      index_col='id')

    def _label_for_ip(self, ip):
        """Return ``'<ip>    <addr>'`` for ``ip``, or ``ip`` itself on any failure.

        The lookup is best effort: an unparsable IP, a missing range table
        or an IP outside every range all fall back to the bare ``ip``.
        """
        try:
            # Dotted quad -> unsigned 32-bit integer (network byte order).
            ip_num = struct.unpack("!I", socket.inet_aton(str(ip)))[0]
            ranges = self.ip_addr_df
            in_range = ((ranges.ip_start_num <= ip_num)
                        & (ip_num <= ranges.ip_end_num))
            hits = ranges.loc[in_range, 'addr']
            if hits.empty:
                return ip
            return '{ip}    {addr}'.format(ip=ip, addr=hits.iloc[0])
        except Exception:
            # Deliberately broad, but unlike a bare ``except`` this lets
            # GeneratorExit / KeyboardInterrupt / SystemExit propagate.
            return ip

    def reducer_top100(self, _, values):
        """Yield the 100 largest ``(count, label)`` pairs, highest count first."""
        for cnt, ip in heapq.nlargest(100, values, key=lambda x: int(x[0])):
            # Yield outside any try block, so closing this generator can
            # never be intercepted and answered with another yield.
            yield cnt, self._label_for_ip(ip)

    def steps(self):
        """Return the two steps: count per IP, then top-100 with addresses."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer_sum),
            MRStep(reducer_init=self.init_ip_addr_df,
                   reducer=self.reducer_top100)
        ]
Exemple #4
0
class MRUrlReq(MRJob):
    """Count hits per requested URL and output the 100 largest counts."""

    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit ``(request_url, 1)`` for the parsed log line."""
        parser = self.ng_line_parser
        parser.parse(line)
        yield parser.request_url, 1  # requested page

    def reducer_sum(self, key, values):
        """Total the hits of one URL and emit ``[count, url]`` under ``None``."""
        hits = sum(values)
        yield None, [hits, key]

    def reducer_top100(self, _, values):
        """Yield the 100 largest ``[count, url]`` pairs in descending order."""
        leaders = heapq.nlargest(100, values)
        for hits, url in leaders:
            yield hits, url

    def steps(self):
        """Return the two steps: count per URL, then keep the top 100."""
        count_step = MRStep(mapper=self.mapper, reducer=self.reducer_sum)
        rank_step = MRStep(reducer=self.reducer_top100)
        return [count_step, rank_step]
Exemple #5
0
class MRBrowser(MRJob):
    """Count hits per browser and output the 100 largest counts."""

    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit ``(browser, 1)`` for the parsed log line."""
        parser = self.ng_line_parser
        parser.parse(line)
        yield parser.browser, 1

    def reducer_sum(self, key, values):
        """Total the hits of one browser and emit ``[count, browser]`` under ``None``."""
        hits = sum(values)
        yield None, [hits, key]

    def reducer_top100(self, _, values):
        """Yield the 100 largest ``[count, browser]`` pairs in descending order."""
        leaders = heapq.nlargest(100, values)
        for hits, name in leaders:
            yield hits, name

    def steps(self):
        """Return the two steps: count per browser, then keep the top 100."""
        count_step = MRStep(mapper=self.mapper, reducer=self.reducer_sum)
        rank_step = MRStep(reducer=self.reducer_top100)
        return [count_step, rank_step]
Exemple #6
0
class MRUrlRef(MRJob):
    """Count hits per referring URL and output every count, largest first."""

    ng_line_parser = NgLineParser()

    def mapper(self, _, line):
        """Emit ``(reference_url, 1)`` for the parsed log line."""
        parser = self.ng_line_parser
        parser.parse(line)
        yield parser.reference_url, 1  # referring (external) URL

    def reducer_sum(self, key, values):
        """Total the hits of one referrer and emit ``[count, url]`` under ``None``."""
        hits = sum(values)
        yield None, [hits, key]

    def reducer_desc(self, key, values):
        """Yield every ``[count, url]`` pair sorted in descending order."""
        ranked = list(values)
        ranked.sort(reverse=True)
        for hits, url in ranked:
            yield hits, url

    def steps(self):
        """Return the two steps: count per referrer, then sort descending."""
        count_step = MRStep(mapper=self.mapper, reducer=self.reducer_sum)
        sort_step = MRStep(reducer=self.reducer_desc)
        return [count_step, sort_step]
Exemple #7
0
class PDNgLogStat(object):
    """Pandas-based statistics over parsed nginx access-log lines.

    ``load_data`` fills ``self.df`` and must run before any statistics
    method; ``load_ip_addr`` fills ``self.ip_addr_df``, which the
    ``*_addr`` methods use to translate an IP into an address.
    """

    def __init__(self):
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Yield one parsed dict per line of every file in ``pathes``."""
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def _ip2num(self, ip):
        """Convert a dotted-quad IP to its unsigned 32-bit integer value.

        Returns ``-1`` when ``ip`` cannot be parsed as an IPv4 address.
        """
        try:
            # "!I" reads the 4 packed bytes in network byte order.
            return struct.unpack("!I", socket.inet_aton(str(ip)))[0]
        except (socket.error, ValueError, TypeError, struct.error):
            # Only conversion failures map to -1; unrelated exceptions
            # (e.g. KeyboardInterrupt) are no longer discarded.
            return -1

    def _get_addr_by_ip(self, ip):
        """Return the address of the range containing ``ip``, else ``None``.

        ``None`` is also returned when ``ip`` is not a valid IPv4 address or
        when the range table is missing or cannot be compared against.
        """
        ip_num = self._ip2num(ip)
        if ip_num < 0:
            return None

        try:
            ranges = self.ip_addr_df
            in_range = ((ranges.ip_start_num <= ip_num)
                        & (ip_num <= ranges.ip_end_num))
            hits = ranges.loc[in_range, 'addr']
        except (AttributeError, KeyError, TypeError, ValueError):
            # Table not loaded, columns missing or not comparable.
            return None
        if hits.empty:
            return None
        return hits.iloc[0]

    def load_data(self, path):
        """Parse the given log file(s) into ``self.df``.

        ``path`` may be a single path string or an iterable of paths. A bare
        string is wrapped in a list so that it is not iterated character by
        character.
        """
        pathes = [path] if isinstance(path, str) else path
        self.df = pd.DataFrame(self._log_line_iter(pathes))

    def _count_by(self, col):
        """Return a frame indexed by ``col`` with one ``'count'`` column."""
        return self.df.groupby(col)[col].count().to_frame('count')

    def _pv_by(self, key_func):
        """Count ``access_time`` rows grouped by ``key_func(access_time)``."""
        keys = self.df['access_time'].map(key_func)
        return self.df[['access_time']].groupby(keys).agg(['count'])

    def _with_addr(self, cnt_df):
        """Append an ``'addr'`` column looked up from the IPs in the index."""
        cnt_df.insert(len(cnt_df.columns),
                      'addr',
                      cnt_df.index.map(self._get_addr_by_ip))
        return cnt_df

    def pv_day(self):
        """Count page views per day (the first field of ``access_time``)."""
        return self._pv_by(lambda x: x.split()[0])

    def pv_hour(self):
        """Count page views per hour of day (hour field of ``access_time``)."""
        return self._pv_by(lambda x: x.split()[-1].split(':')[0])

    def url_ref_stat(self):
        """Count hits per referring URL, largest count first."""
        return self._count_by('reference_url').sort_values(by='count',
                                                           ascending=False)

    def url_req_stat(self):
        """Count hits per requested URL, largest count first."""
        return self._count_by('request_url').sort_values(by='count',
                                                         ascending=False)

    def uv_cdn_ip(self, top = 100):
        """Return the ``top`` CDN IPs by hit count."""
        return self._count_by('cdn_ip').nlargest(top, 'count')

    def uv_real_ip(self, top = 100):
        """Return the ``top`` real IPs by hit count."""
        return self._count_by('real_ip').nlargest(top, 'count')

    def uv_cdn_ip_addr(self, top = 100):
        """Return the ``top`` CDN IPs by hit count, with an ``'addr'`` column."""
        return self._with_addr(self.uv_cdn_ip(top))

    def uv_real_ip_addr(self, top = 100):
        """Return the ``top`` real IPs by hit count, with an ``'addr'`` column."""
        return self._with_addr(self.uv_real_ip(top))

    def browser_stat(self, top = 100):
        """Return the ``top`` browsers by hit count."""
        return self._count_by('browser').nlargest(top, 'count')

    def load_ip_addr(self, path):
        """Load the tab-separated IP-range table into ``self.ip_addr_df``.

        The table is indexed by its ``id`` column and is also returned.
        """
        cols = ['id', 'ip_start_num', 'ip_end_num',
                'ip_start', 'ip_end', 'addr', 'operator']
        self.ip_addr_df = pd.read_csv(path, sep='\t', names=cols, index_col='id')
        return self.ip_addr_df
Exemple #8
0
 def __init__(self):
     """Create an ``NgLineParser`` and keep it on ``self.ng_line_parser``."""
     self.ng_line_parser = NgLineParser()