Example No. 1
def test():
    """Test for methods in this file"""
    import os
    import pandas as pd  # pd is used below but was never imported in this snippet
    pd.set_option('display.width', 1000)
    # log_to_parquet (called below) is assumed to live alongside parquet_to_df
    from bat.dataframe_to_parquet import parquet_to_df, log_to_parquet
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils
    import tempfile

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../data')
    log_path = os.path.join(data_path, 'dns.log')

    # Convert the log to a Pandas DataFrame
    log_to_df = LogToDataFrame()
    dns_df = log_to_df.create_dataframe(log_path)

    # Print out the head
    print(dns_df.head())

    # Create a temporary file
    filename = tempfile.NamedTemporaryFile(delete=False).name

    # Write to a parquet file
    log_to_parquet(log_path, filename)

    # Read from the parquet file
    new_dns_df = parquet_to_df(filename)

    # Remove temp file
    os.remove(filename)

    # Print out the head
    print(new_dns_df.head())

    # Make sure our conversions didn't lose type info
    # Note: This is no longer going to work
    #       See: https://issues.apache.org/jira/browse/ARROW-5379
    # assert(dns_df.dtypes.values.tolist() == new_dns_df.dtypes.values.tolist())

    # Test an empty log (a log with header/close but no data rows)
    test_path = os.path.join(data_path, 'http_empty.log')
    filename = tempfile.NamedTemporaryFile(delete=False).name
    log_to_parquet(test_path, filename)
    parquet_to_df(filename)
    os.remove(filename)

    print('DataFrame to Parquet Tests successful!')
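
Because the exact dtype assertion above is disabled, a weaker sanity check on the round trip is still possible; a minimal sketch (not part of the original test), using the frames from the body above:

# Hedged check: the parquet round trip should at least preserve the row count
assert len(dns_df) == len(new_dns_df)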
Example No. 2
def bro_log_to_df(file_path):
    """
    Load a Bro log into a pandas DataFrame object.
    :param file_path: Log file
    :return: pandas DataFrame object
    """
    return LogToDataFrame(file_path)
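
A minimal usage sketch, assuming the older bat API shown above (where LogToDataFrame(file_path) itself behaves like a DataFrame) and a hypothetical 'conn.log' path:

from bat.log_to_dataframe import LogToDataFrame

df = bro_log_to_df('conn.log')  # 'conn.log' is a placeholder log file
print(df.head())
print(df.dtypes)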
Example No. 3
def test():
    """Test for methods in this file"""
    import os
    import pandas as pd  # pd is used below but was never imported in this snippet
    pd.set_option('display.width', 1000)
    # log_to_parquet (called below) is assumed to live alongside parquet_to_df
    from bat.dataframe_to_parquet import parquet_to_df, log_to_parquet
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils
    import tempfile

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../data')
    test_path = os.path.join(data_path, 'dns.log')

    # Convert the log to a Pandas DataFrame
    dns_df = LogToDataFrame(test_path)

    # Print out the head
    print(dns_df.head())

    # Create a temporary file
    filename = tempfile.NamedTemporaryFile(delete=False).name

    # Write to a parquet file
    log_to_parquet(test_path, filename)

    # Read from the parquet file
    new_dns_df = parquet_to_df(filename)

    # Remove temp file
    os.remove(filename)

    # Print out the head
    print(new_dns_df.head())

    assert (dns_df.dtypes.values.tolist() == new_dns_df.dtypes.values.tolist())

    # Test an empty log (a log with header/close but no data rows)
    test_path = os.path.join(data_path, 'http_empty.log')
    filename = tempfile.NamedTemporaryFile(delete=False).name
    log_to_parquet(test_path, filename)
    parquet_to_df(filename)
    os.remove(filename)

    print('DataFrame to Parquet Tests successful!')
Example No. 4
def test():
    """Test for methods in this file"""
    import os
    import pandas as pd  # pd is used below but was never imported in this snippet
    pd.set_option('display.width', 1000)
    # log_to_parquet (called below) is assumed to live alongside parquet_to_df
    from bat.dataframe_to_parquet import parquet_to_df, log_to_parquet
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils
    import tempfile

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../data')
    test_path = os.path.join(data_path, 'dns.log')

    # Convert the log to a Pandas DataFrame
    dns_df = LogToDataFrame(test_path)
    # dns_df.reset_index(inplace=True)

    # Print out the head
    print(dns_df.head())

    # Create a temporary file
    filename = tempfile.NamedTemporaryFile(delete=False).name

    # Write to a parquet file
    log_to_parquet(test_path, filename)

    # Read from the parquet file
    new_dns_df = parquet_to_df(filename)

    # Remove temp file
    os.remove(filename)

    # Print out the head
    print(new_dns_df.head())

    # Make sure our conversions didn't lose type info
    # TODO: Uncomment this test when the following issue is fixed
    #       - TimeDelta Support: https://issues.apache.org/jira/browse/ARROW-835
    # assert(dns_df.dtypes.values.tolist() == new_dns_df.dtypes.values.tolist())

    print('DataFrame to Parquet Tests successful!')
Example No. 5
import pandas as pd
from bat.log_to_dataframe import LogToDataFrame
import plotly.offline as py
import plotly.graph_objs as go

web = LogToDataFrame('http.log')
print(web.host.unique())
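
The plotly imports above are unused in the snippet as shown; one plausible continuation (a hedged sketch, with 'http_hosts.html' as an arbitrary output name) charts the most frequent hosts:

# Count the ten most frequent HTTP hosts from the DataFrame above
host_counts = web['host'].value_counts().head(10)

# Render an offline bar chart to a standalone HTML file
fig = go.Figure(data=[go.Bar(x=host_counts.index.tolist(), y=host_counts.values.tolist())])
py.plot(fig, filename='http_hosts.html')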
Example No. 6
    # Collect args from the command line
    parser = argparse.ArgumentParser()
    parser.add_argument('-f',
                        '--bro-log',
                        type=str,
                        help='Specify a bro log to run BroLogReader test on')
    args, commands = parser.parse_known_args()

    # Check for unknown args
    if commands:
        print('Unrecognized args: %s' % commands)
        sys.exit(1)

    # If no args just call help
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # File may have a tilde in it
    if args.bro_log:
        args.bro_log = os.path.expanduser(args.bro_log)

        # Create a Pandas dataframe from a Bro log
        bro_df = LogToDataFrame(args.bro_log)

        # Print out the head of the dataframe
        print(bro_df.head())

        # Print out the types of the columns
        print(bro_df.dtypes)
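
Run as, for example, python bro_to_df_example.py -f ~/logs/dns.log (the script name here is hypothetical); os.path.expanduser expands the tilde before the log is loaded.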
Example No. 7
    parser.add_argument('bro_log',
                        type=str,
                        help='Specify a bro log to run BroLogReader test on')
    args, commands = parser.parse_known_args()

    # Check for unknown args
    if commands:
        print('Unrecognized args: %s' % commands)
        sys.exit(1)

    # File may have a tilde in it
    if args.bro_log:
        args.bro_log = os.path.expanduser(args.bro_log)

        # Create a Pandas dataframe from a Bro log
        log_to_df = LogToDataFrame()
        bro_df = log_to_df.create_dataframe(args.bro_log)

        # Print out the head of the dataframe
        print(bro_df.head())

        # Print out the types of the columns
        print(bro_df.dtypes)

        # Print out size and memory usage
        print('DF Shape: {:s}'.format(str(bro_df.shape)))
        print('DF Memory:')
        memory_usage = bro_df.memory_usage(deep=True)
        total = memory_usage.sum()
        for name, num_bytes in memory_usage.items():
            print('\t {:s}: \t{:.2f} MB'.format(name, num_bytes / 1e6))
        print('Total: {:.2f} MB'.format(total / 1e6))  # 'total' was computed above but never printed
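
pandas can produce a similar per-column report directly; a one-line alternative:

# Prints dtypes plus deep (string-aware) memory usage for each column
bro_df.info(memory_usage='deep')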
Example No. 8
    x = datetime.now()
    print("Starting script at {}".format(x))

    configfile_ok = 1
    pp = pprint.PrettyPrinter(indent=4)
    new_list = []

    brologfile = sys.argv[1]
    print('Analyzing Bro file:', str(brologfile))
    # pwd, csvfile (name suffix), and logfile are assumed to be defined earlier in the original script
    csvfile = str(pwd) + "/" + str(brologfile).split("/")[0] + "/" + str(
        brologfile).split("/")[-1] + "_" + str(csvfile) + ".csv"
    print("Results at: " + str(csvfile))
    print("Logs at: " + str(logfile))

    # Create a Pandas dataframe from a Bro log
    log_to_df = LogToDataFrame()
    bro_df = log_to_df.create_dataframe(brologfile)

    # Insert new labels columns at the end
    columns = len(bro_df.columns)
    bro_df.insert(columns, "MultiLabel", "", True)
    bro_df.insert(columns + 1, "UniLabel", "", True)
    bro_df.insert(columns + 2, "Label", "", True)

    # Loop the generated dataframe rows to search for key analysis patterns (such as known malicious ports)
    iterrows = bro_df.iterrows()
    information = 0
    # Note: wc -l also counts Bro's '#'-prefixed header/footer lines, so this is an upper bound
    leniterrows = os.popen('wc -l ' + str(brologfile) +
                           '| cut -d " " -f1 ').read()
    print("Analyzing a total of {} rows; this may take several minutes or hours . . .".format(leniterrows))
Example No. 9
def pull_data(uname, passwd, local_dir="./", device="em2", date=None, export=False):
    server = config['server']  # config is assumed to be a module-level dict in the original script
    user = uname
    password = passwd
    bro_dir = '/mnt/localraid/bro/logs'
    # Get date and time; if none is provided, default to the current date
    if not date:
        cdt = dt.datetime.fromtimestamp(time.time())
    else:
        if ':' not in date:
            date += " 12:00:00"
        cdt = parse(date)
    # NOTE: naive 'yesterday' computation; this breaks on the first day of a month
    datestr = '-'.join([str(cdt.year), str(cdt.month), str(cdt.day - 1)])
    cur_date = '-'.join([str(cdt.year), str(cdt.month), str(cdt.day)])
        
    sh = ShellHandler(server, user, password)
    # remove local and remote tmp folder, if it exists
    tmp_folder = "./tmp_{}".format(cur_date)
    sh.execute('cd {}'.format(bro_dir))
    sh.execute('rm -rf {}'.format(tmp_folder))
    os.system('rm -rf {}'.format(tmp_folder))
    # create new empty tmp folder
    sh.execute('mkdir {}'.format(tmp_folder))
    # find stats and capture_loss files
    sh.execute('touch --date "{}" /tmp/start'.format(datestr))
    stats_cmd = 'find -type f -newer /tmp/start -name "*stats*"'
    _,stat_files,_ = sh.execute(stats_cmd)
    cap_loss_cmd = 'find -type f -newer /tmp/start -name "*capture_loss*"'
    _,cl_files,_ = sh.execute(cap_loss_cmd)
    all_files = cl_files + stat_files
    all_files = [bro_dir + f[1:].strip() for f in all_files if '.log' in f]
    # Escape colons so the filenames survive the shell commands below
    all_files = [f.replace(':', r'\:') for f in all_files]
    # add dates to filenames if necessary to uniquely identify hours from different days
    dates = []
    for l in all_files:
        dates += re.findall(r'([0-9]{4}-[0-9]{2}-[0-9]{2})/', l)
    new_files = []
    for date, f in zip(dates,all_files):
        f_tokens = f.split('/')
        new_filename = tmp_folder + '/' + date + '-' + f_tokens[-1]
        new_files.append(new_filename)
    cp_cmd = ';'.join([' cp {} {}'.format(old,new) for old,new in zip(all_files, new_files)])
    if len(cp_cmd) > 0: 
        sh.execute(cp_cmd)       
    current_files = ['/mnt/localraid/bro/logs/current/capture_loss.log', '/mnt/localraid/bro/logs/current/stats.log']
    for file in current_files:
        print('cp {} {}'.format(file, tmp_folder))
        sh.execute('cp {} {}'.format(file, tmp_folder))
    traffic_stats_filename = '/trafficStats_v{}_{}.txt'.format(version_number, device)
    traffic_stats_path = '/home/bea3ch/shared/trafficAnalysis' + traffic_stats_filename
    sh.execute('cp {} {}'.format(traffic_stats_path, tmp_folder))
    # compress tmp folder into a tarball and copy to local
    sh.execute('tar -cvf tarball.tar {}'.format(tmp_folder))
    sh.scp.get(r'{}/tarball.tar'.format(bro_dir), r'./')
    # remove tarball on server
    sh.execute('rm -rf {} tarball.tar'.format(tmp_folder))    
    # unzip local tarball to get tmp folder
    os.system('tar -xvf ./tarball.tar')
    local_files = os.listdir('{}'.format(tmp_folder))
    # remove local tarball
    os.system('rm ./tarball.tar')
    # unzip any remaining gz files in tmp folder
    for f in local_files:
        if '.gz' in f:
            os.system('gunzip {}'.format('/'.join([tmp_folder, f])))
    # remove any remaining .gz files
    os.system('rm {}/*.gz'.format(tmp_folder))
    # read in capture loss files
    capture_loss_files = glob.glob('{}/*capture_loss*log'.format(tmp_folder))
    capture_loss_files.sort()
    capture_loss_df = LogToDataFrame(capture_loss_files.pop())
    for file in capture_loss_files:
        try:
            # merge() returns a new frame (the original discarded the result);
            # row-wise concatenation is the likely intent when combining daily logs
            capture_loss_df = pd.concat([capture_loss_df, LogToDataFrame(file)])
        except Exception as e:
            print('Error loading', file + ':', e)
    # reset index and convert datetimes to unix epochs
    capture_loss_df.reset_index(level=0, inplace=True)
    # 'dt' is the datetime alias used above; the original mixed 'datetime' and 'dt'
    capture_loss_df.ts = capture_loss_df.ts.map(lambda x: (x - dt.datetime(1970, 1, 1)).total_seconds())
    capture_loss_df.drop('ts_delta', axis=1, inplace=True)
    # read in bro stats files
    stats_files = glob.glob('{}/*stats*log'.format(tmp_folder))
    stats_files.sort()
    stats_df = LogToDataFrame(stats_files.pop())
    for file in stats_files:
        try:
            # As above: keep the combined frame instead of discarding merge()'s result
            stats_df = pd.concat([stats_df, LogToDataFrame(file)])
        except Exception as e:
            print('Error loading', file + ':', e)
    # reset index and convert datetimes to unix epochs
    stats_df.reset_index(level=0, inplace=True)
    stats_df.ts = stats_df.ts.map(lambda x: (x - dt.datetime(1970, 1, 1)).total_seconds())
    # str(Series) would stringify the whole column at once; per-element conversion is the likely intent
    stats_df.pkt_lag = stats_df.pkt_lag.astype(str)
    # read in trafficStats csv
    traffic_stats_df = pd.read_csv(tmp_folder + traffic_stats_filename, index_col=False)
    # unique_traffic_stats_timestamps is assumed to be a module-level set in the original script
    unique_traffic_stats_timestamps.update(traffic_stats_df.ts.unique())
    # rename [cpu0 -> cpu00], [cpu1 -> cpu01], ..., [cpu9 -> cpu09]
    rename_keys = {}
    for i in range(10):  # xrange in the original (Python 2)
        rename_keys['cpu' + str(i)] = 'cpu0' + str(i)
    traffic_stats_df = traffic_stats_df.rename(columns=rename_keys)
    
    if export:
        capture_loss_df.to_csv('capture_loss_comb_{}.csv'.format(datestr))
        stats_df.to_csv('stats_comb_{}.csv'.format(datestr))
        traffic_stats_df.to_csv('trafficStats_comb_{}.csv'.format(datestr))
    
    return sh, traffic_stats_df, capture_loss_df, stats_df
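
A hedged usage sketch (ShellHandler, config, version_number, and the other module-level names come from elsewhere in the original script; the credentials and date below are placeholders):

sh, traffic_df, cap_loss_df, stats_df = pull_data('analyst', 'secret', date='2019-03-01', export=True)
print(cap_loss_df.head())
print(stats_df.head())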