Esempio n. 1
0
def get_logdata_df():
    '''
    Build the per-(user, app) usage DataFrame to be written to the database.

    Groups conn-log rows by (origin IP, responder port), keeps only ports
    registered in the database, and computes how long each group was seen
    within the past hour.

    @return: DataFrame with columns ['user', 'app', 'duration'] (duration in
             seconds, rounded to 3 decimals); empty DataFrame when the conn
             log file is missing.
    '''
    file_path_all = get_file('conn')
    # Bail out early when the log file does not exist.
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    df = read_file.pandas_normal_gz(file_path_all)
    # Positions 0/2/5: ts, origin IP, responder port; drop the trailing
    # summary row and any incomplete records.
    df = df.iloc[:-1, [0, 2, 5]]
    df = df.dropna(how='any')
    grouped = df.groupby([2, 5])
    db_port = get_data_base_port()  # port -> app name mapping from the DB
    np_list = []
    for gp in grouped.groups:
        data_port = gp[1]
        # Test the dict directly: .iterkeys() was Python-2-only and crashes
        # under Python 3; plain `in` is also O(1) instead of a linear scan.
        if data_port in db_port:
            gp_df = grouped.get_group(gp)
            min_ts = gp_df.iloc[:, 0].min()
            # Clamp the start timestamp to the beginning of the past hour.
            if min_ts < get_past_hour_ts(1):
                min_ts = get_past_hour_ts(1)
            max_ts = gp_df.iloc[:, 0].max()
            duration = round(max_ts - min_ts, 3)
            np_list.append([gp[0], db_port[data_port], duration])
    df_result = pd.DataFrame(np_list, columns=['user', 'app', 'duration'])
    return df_result
Esempio n. 2
0
def get_conn_content():
    '''
    Return the conn-log columns fid/user/orig_ip_bytes/resp_ip_bytes.

    @return: DataFrame with those four columns, or an empty DataFrame when
             the conn log file is missing.
    '''
    conn_path = get_file('conn')
    if not os.path.exists(conn_path):
        return pd.DataFrame()
    raw = read_file.pandas_normal_gz(conn_path)
    # Positions 1/2/17/19 hold fid, user, orig bytes and resp bytes.
    selected = raw.iloc[:, [1, 2, 17, 19]]
    selected.columns = ['fid', 'user', 'orig_ip_bytes', 'resp_ip_bytes']
    return selected
Esempio n. 3
0
def get_http_video():
    '''
    Extract video-app usage from the http log.

    Groups http records by (fid, user-agent content) and maps known video
    player user-agent prefixes to canonical app names; generic browser
    agents are skipped entirely.

    @return: DataFrame with columns ['fid', 'video_name']; empty DataFrame
             when the http log file is missing.
    '''
    # Plain-browser user agents: ignore these records outright.
    browser_prefixes = ('Mozilla', 'Dalvik', 'Safari', 'Opera')
    # Known player prefixes mapped to the app name we record.  Order matters
    # only in that both 'Youku' and 'youku-tudou' map to 'Youku'.
    video_prefixes = (
        ('Youku', 'Youku'),
        ('youku-tudou', 'Youku'),
        ('MGTV', 'MGTV'),
        ('SOHUVideo', 'SOHUVideo'),
        ('QYPlayer', 'QYPlayer'),
        ('qqlive', 'qqlive'),
        ('kwai', 'kwai'),
        ('PPStream', 'PPStream'),
        ('Letv', 'Letv'),
        ('Funshion', 'Funshion'),
        ('Xfplay', 'Xfplay'),
    )

    file_path_all = get_file('http')
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    df = read_file.pandas_normal_gz(file_path_all)
    df_drop = df.iloc[:, [0, 1, 12]]
    usagent_content = df_drop.dropna(axis=0)  # drop rows with empty content
    result = usagent_content.groupby([1, 12])  # keys: (fid, user-agent)

    app_use = []  # rows of [fid, app name]
    for fid, agent in result.groups:
        # str.startswith accepts a tuple: one call replaces the chained ors.
        if agent.startswith(browser_prefixes):
            continue
        for prefix, app_name in video_prefixes:
            if agent.startswith(prefix):
                app_use.append([fid, app_name])
                break
    df_app_use = pd.DataFrame(app_use, columns=['fid', 'video_name'])
    return df_app_use
Esempio n. 4
0
def get_data(filename):
    '''
    Collect the traffic summary values to be written out.

    @param filename: path of the gzipped conn log file
    @return: tuple (log_file_size, total_connections, total_bytes,
             orig_bytes, resp_bytes)
    '''
    log_file = get_log_file_size()
    df = read_file.pandas_normal_gz(filename)
    # int() replaces the Python-2-only long(), which is a NameError under
    # Python 3; Python 3 ints are arbitrary precision anyway.
    conn_all = int(df[1].count())
    traff_resp = int(df[19].sum())
    traff_orig = int(df[17].sum())
    traff_all = traff_orig + traff_resp
    return log_file, conn_all, traff_all, traff_orig, traff_resp
Esempio n. 5
0
def get_log():
    '''
    Return the conn-log columns needed by the caller.

    @return: DataFrame with columns protocol/orig_port/resp_port/orig_ip,
             or an empty DataFrame when the conn log file is missing.
    '''
    file_path_all = get_file('conn')
    # Guard clause instead of the redundant if/else with two return paths.
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    log_df = read_file.pandas_normal_gz(file_path_all)
    # Select positions 6/3/5/2 (dropping the trailing summary row) and
    # rename WITHOUT inplace: renaming a slice in place triggers pandas'
    # SettingWithCopyWarning and may not stick.
    return log_df.iloc[:-1, [6, 3, 5, 2]].rename(
        columns={6: 'protocol', 3: 'orig_port', 5: 'resp_port', 2: 'orig_ip'})
Esempio n. 6
0
def get_data():
    '''
    Read the conn log and return the ten heaviest (origin IP, responder IP)
    pairs by combined traffic volume.

    @return: DataFrame indexed by ('orignIp', 'respIp') with summed 'orign'
             and 'resp' byte counts plus their total 'results', sorted
             descending by 'results'.
    '''
    log_path = get_file()
    conn_df = read_file.pandas_normal_gz(log_path)
    conn_df.rename(
        columns={2: 'orignIp', 4: 'respIp', 17: 'orign', 19: 'resp'},
        inplace=True)
    # Positions 2/4/17/19: origin IP, responder IP, orig bytes, resp bytes;
    # the final summary row of the log is dropped.
    wanted = conn_df.iloc[:-1, [2, 4, 17, 19]]
    totals = wanted.groupby(['orignIp', 'respIp']).sum()
    totals['results'] = totals['orign'] + totals['resp']
    return totals.sort_values(by='results', ascending=False).head(10)
Esempio n. 7
0
def get_data():
    '''
    Aggregate conn-log traffic per (origin IP, responder IP) pair and hand
    back the ten largest pairs.

    @return: DataFrame grouped by ('orignIp', 'respIp') carrying 'orign',
             'resp' and combined 'results' bytes, largest first.
    '''
    path = get_file()
    frame = read_file.pandas_normal_gz(path)
    frame.rename(columns={2: 'orignIp', 4: 'respIp', 17: 'orign', 19: 'resp'},
                 inplace=True)
    # Keep IPs and byte counters (positions 2, 4, 17, 19), discarding the
    # final summary line of the log.
    subset = frame.iloc[:-1, [2, 4, 17, 19]]
    summed = subset.groupby(['orignIp', 'respIp']).sum()
    summed['results'] = summed['orign'] + summed['resp']
    ranked = summed.sort_values(by='results', ascending=False)
    return ranked.head(10)
Esempio n. 8
0
def get_data():
    '''
    Sum conn-log traffic per service.

    @return: DataFrame indexed by 'service' with summed 'orig_bytes' and
             'resp_bytes'.  Returns the integer 0 when the log file is
             missing.
             NOTE(review): the 0 sentinel differs from the empty-DataFrame
             convention used by the sibling helpers; kept unchanged so
             existing `== 0` checks in callers still work.
    '''
    file_path_all = get_file('conn')
    if not os.path.exists(file_path_all):
        return 0
    all_data = read_file.pandas_normal_gz(file_path_all)
    # Positions 7/17/19: service, orig_bytes, resp_bytes; drop the trailing
    # summary row.
    value = all_data.iloc[:-1, [7, 17, 19]]
    useful_data = value.dropna(how='any')  # drop rows with missing values
    # One non-inplace rename replaces three separate inplace calls, which
    # risked pandas' SettingWithCopyWarning on a derived frame.
    useful_data = useful_data.rename(
        columns={7: 'service', 17: 'orig_bytes', 19: 'resp_bytes'})
    return useful_data.groupby('service').sum()
Esempio n. 9
0
def get_logdata_df():
    '''
    Build the per-(user, webhost) browsing-duration DataFrame for the DB.

    Filters http-log records by an IP pattern, reduces each host to its
    registrable label, keeps only hosts registered in the database, and
    computes how long each (user, host) pair was seen within the past hour.

    @return: DataFrame with columns ['user', 'webhost', 'duration'];
             empty DataFrame when the http log file is missing.
    '''
    # TODO: adapt this pattern to the local network environment.  The
    # original source left the literal empty (a syntax error); r'.*'
    # keeps every record until a real filter is configured.
    pattern = r'.*'
    file_path_all = get_file('http')
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    df = read_file.pandas_normal_gz(file_path_all)
    # Keep only rows whose origin IP matches the configured pattern.
    df = df[df.iloc[:, 2].str.match(pattern)]
    df = df.iloc[:-1, [0, 2, 8]]  # ts, origin IP, host
    df = df.dropna(how='any')
    grouped = df.groupby([2, 8])
    db_host = get_data_base_host()  # host -> site name mapping from the DB
    np_list = []
    for gp in grouped.groups:
        # Reduce e.g. 'video.example.com.cn' to 'example': take the label
        # before a secondary TLD, else the second-to-last label.
        host_parts = gp[1].split('.')
        data_host = gp[1]
        if len(host_parts) > 2 and host_parts[-2] in ('com', 'cn', 'net',
                                                      'gov', 'org'):
            # len > 2 guard prevents the IndexError the original hit on
            # two-label hosts such as 'com.cn'.
            data_host = host_parts[-3]
        elif len(host_parts) > 1:
            data_host = host_parts[-2]
        # Test the dict directly: .iterkeys() was Python-2-only.
        if data_host in db_host:
            gp_df = grouped.get_group(gp)
            min_ts = gp_df.iloc[:, 0].min()
            # Clamp the start timestamp to the beginning of the past hour.
            if min_ts < get_past_hour_ts(1):
                min_ts = get_past_hour_ts(1)
            max_ts = gp_df.iloc[:, 0].max()
            np_list.append([gp[0], db_host[data_host], min_ts])
            np_list.append([gp[0], db_host[data_host], max_ts])
    df_result = pd.DataFrame(np_list, columns=['user', 'webhost', 'ts'])
    # Second pass: collapse the min/max timestamp rows of each
    # (user, webhost) pair into a single duration.
    result_group = df_result.groupby(['user', 'webhost'])
    np_list2 = []
    for gp in result_group.groups:
        gp_df = result_group.get_group(gp)
        min_ts = gp_df.iloc[:, 2].min()
        max_ts = gp_df.iloc[:, 2].max()
        duration = round(max_ts - min_ts, 3)
        np_list2.append([gp[0], gp[1], duration])
    df_result = pd.DataFrame(np_list2, columns=['user', 'webhost', 'duration'])
    return df_result
Esempio n. 10
0
def get_log():
    '''
    Return the conn-log columns needed downstream.

    @return: DataFrame with columns protocol/orig_port/resp_port/orig_ip,
             or an empty DataFrame when the conn log file is missing.
    '''
    file_path_all = get_file('conn')
    # Guard clause replaces the if/else with two return paths.
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    log_df = read_file.pandas_normal_gz(file_path_all)
    # Select positions 6/3/5/2 (dropping the trailing summary row) and
    # rename WITHOUT inplace: an inplace rename on an iloc slice triggers
    # pandas' SettingWithCopyWarning and may silently fail to stick.
    return log_df.iloc[:-1, [6, 3, 5, 2]].rename(columns={
        6: 'protocol',
        3: 'orig_port',
        5: 'resp_port',
        2: 'orig_ip'
    })
def get_file_type():
    '''
    Count occurrences of each file type in the files log.

    @return: DataFrame with columns ['file_type', 'count1'] holding the 15
             most frequent types (ascending by count, matching the original
             sort_values().tail() order), or an empty DataFrame when the
             log file is missing.
    '''
    file_path_all = get_file('files')
    if not os.path.exists(file_path_all):
        return pd.DataFrame()
    df = read_file.pandas_normal_gz(file_path_all)
    files_content = df.iloc[:, [8]]  # position 8: file_type
    files_content = files_content.dropna(axis=0)  # drop empty rows
    # Frequency per type, keeping the 15 largest (still ascending).
    top = files_content.groupby([8]).size().sort_values().tail(15)
    # list(zip(...)) materializes the lazy Python-3 zip iterator for the
    # DataFrame constructor, and avoids the original's manual copy loops
    # that shadowed the builtin name `type`.
    rows = list(zip(top.index, top.values))
    return pd.DataFrame(rows, columns=['file_type', 'count1'])
Esempio n. 12
0
def get_user_data():
    '''
    Collect per-user traffic records for database insertion.

    For every origin IP surviving the db_config IP filter, builds a
    traffUser carrying the first/last timestamps seen and the summed
    incoming/outgoing byte counts.

    @return: list of traffUser objects, or an empty DataFrame when the
             conn log file is missing.
    '''
    conn_path = get_file('conn')
    if not os.path.exists(conn_path):
        return pd.DataFrame()
    frame = read_file.pandas_normal_gz(conn_path)
    frame = db_config.filter_ip_df(frame, 2)
    # Positions 0/2/9/10: ts, origin IP, orig bytes, resp bytes; drop the
    # trailing summary row and incomplete records.
    frame = frame.iloc[:-1, [0, 2, 9, 10]].dropna(how='any')
    per_ip = frame.groupby(2)
    users = []
    for ip in per_ip.groups:
        records = per_ip.get_group(ip).iloc[:, [0, 2, 3]]
        ts_col = records.iloc[:, 0]
        outgoing = records.iloc[:, 1].sum()
        incoming = records.iloc[:, 2].sum()
        users.append(traffUser(ip, ts_col.min(), ts_col.max(),
                               incoming, outgoing))
    return users
Esempio n. 13
0
def get_user_data():
    '''
    Build the per-user traffic summary destined for the database.

    Each origin IP (after db_config filtering) yields one traffUser holding
    the earliest and latest timestamps plus total upload/download bytes.

    @return: list of traffUser objects; empty DataFrame if the conn log
             file does not exist.
    '''
    log_path = get_file('conn')
    if not os.path.exists(log_path):
        return pd.DataFrame()
    raw = read_file.pandas_normal_gz(log_path)
    filtered = db_config.filter_ip_df(raw, 2)
    # ts (0), origin IP (2), orig bytes (9), resp bytes (10); the final
    # summary row is excluded, as are rows with missing values.
    usable = filtered.iloc[:-1, [0, 2, 9, 10]]
    usable = usable.dropna(how='any')
    by_ip = usable.groupby(2)
    result = []
    for ip in by_ip.groups:
        rows = by_ip.get_group(ip).iloc[:, [0, 2, 3]]
        first_seen = rows.iloc[:, 0].min()
        last_seen = rows.iloc[:, 0].max()
        upload = rows.iloc[:, 1].sum()
        download = rows.iloc[:, 2].sum()
        result.append(traffUser(ip, first_seen, last_seen, download, upload))
    return result