Example #1
    def readrows(self):
        """The readrows method reads simply 'combines' the rows of
           multiple files OR gunzips the file and then reads the rows
        """

        # For each file (may be just one) create a BroLogReader and use it
        for self._filepath in self._files:

            # Check if the file is zipped
            tmp = None
            if self._filepath.endswith('.gz'):
                tmp = tempfile.NamedTemporaryFile(delete=False)
                with gzip.open(self._filepath,
                               'rb') as f_in, open(tmp.name, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

                # Set the file path to the new temp file
                self._filepath = tmp.name

            # Create a BroLogReader
            reader = bro_log_reader.BroLogReader(self._filepath)
            for row in reader.readrows():
                yield row

            # Clean up any temp files
            try:
                if tmp:
                    os.remove(tmp.name)
                    print('Removed temporary file {:s}...'.format(tmp.name))
            except IOError:
                pass
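A minimal usage sketch for the reader above. Only the method body is shown, so the module path and class name below (bro_multi_log_reader.BroMultiLogReader) are assumptions based on the surrounding ZAT codebase:

# Hypothetical usage; module path and class name are assumptions.
from zat import bro_multi_log_reader

# A single gzipped log (a plain log, or several files, would work the same way)
reader = bro_multi_log_reader.BroMultiLogReader('conn.log.gz')
for row in reader.readrows():
    print(row)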
Example #2
    def parseFile(self, filename, json=False):
        """ Creates a pandas dataframe from given brofile

            Parameters
            ----------
            filename : string
                Path to file to be parsed

            Returns
            -------
            result : pd.DataFrame
                Pandas dataframe containing bro log file
            """
        df = None
        if not json:
            bro_log = bro_log_reader.BroLogReader(filename)
            df = pd.DataFrame(bro_log.readrows())
        else:
            df = pd.read_json(filename, lines=True)
            #df.rename(
            #    index=str,
            #    columns={
            #        'client_header_names': 'header_values'},
            #    inplace=True)
        df['header_values'] = df['header_values'].apply(
            self.__parseHeaderValues__)
        return df
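A usage sketch for parseFile. The enclosing class is not shown, so HTTPLogParser below is purely a hypothetical stand-in name:

# Hypothetical usage; 'HTTPLogParser' stands in for the (unshown) enclosing class.
parser = HTTPLogParser()
df = parser.parseFile('http.log')                   # tab-separated Bro log
df_json = parser.parseFile('http.json', json=True)  # JSON-lines log
print(df['header_values'].head())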
Example #3
def parseLOG(filename):
    """ Generate a list of Dumont Requests from a bro log file

        Parameters
        ----------
        filename : string
            Path to the Bro log file to parse.

        Returns
        -------
        result : list of DumontLog()
            ordered list of dumont logs.

        """
    DumontRequests = []

    bro_log = bro_log_reader.BroLogReader(filename)
    data = pd.DataFrame(bro_log.readrows())
    data['header_values'] = data['header_values'].apply(__parseHeaderValues__)

    for _, row in data.iterrows():
        if row['method'] in ('GET', 'POST'):
            DumontRequests.append(DumontLog(row))

    return aggregateTemporalFeatures(DumontRequests)
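A short usage sketch for parseLOG, assuming an HTTP log that carries the method and header_values fields the function expects:

# Usage sketch: build temporally aggregated Dumont requests from an HTTP log.
# Assumes aggregateTemporalFeatures() returns an ordered list, per the docstring.
dumont_requests = parseLOG('http.log')
print('{:d} GET/POST requests parsed'.format(len(dumont_requests)))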
Example #4
    def create_dataframe(self,
                         log_filename,
                         ts_index=True,
                         aggressive_category=True,
                         usecols=None):
        """ Create a Pandas dataframe from a Bro/Zeek log file
            Args:
               log_fllename (string): The full path to the Zeek log
               ts_index (bool): Set the index to the 'ts' field (default = True)
               aggressive_category (bool): convert unknown columns to category (default = True)
        """

        # Create a Zeek log reader just to read in the header for names and types
        _bro_reader = bro_log_reader.BroLogReader(log_filename)
        _, field_names, field_types, _ = _bro_reader._parse_bro_header(
            log_filename)
        header_names = field_names

        # If usecols is set then we'll subset the fields and types
        if usecols:
            # Usecols needs to include ts
            if 'ts' not in usecols:
                usecols.append('ts')
            field_types = [
                t for t, field in zip(field_types, field_names)
                if field in usecols
            ]
            field_names = [field for field in field_names if field in usecols]

        # Get the appropriate types for the Pandas Dataframe
        pandas_types = self.pd_column_types(field_names, field_types,
                                            aggressive_category)

        # Now actually read the Zeek Log using Pandas read CSV
        self._df = pd.read_csv(log_filename,
                               sep='\t',
                               names=header_names,
                               usecols=usecols,
                               dtype=pandas_types,
                               comment="#",
                               na_values='-')

        # Now we convert 'time' and 'interval' fields to datetime and timedelta respectively
        for name, bro_type in zip(field_names, field_types):
            if bro_type == 'time':
                self._df[name] = pd.to_datetime(self._df[name], unit='s')
            if bro_type == 'interval':
                self._df[name] = pd.to_timedelta(self._df[name], unit='s')

        # Set the index
        if ts_index and not self._df.empty:
            self._df.set_index('ts', inplace=True)
        return self._df
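A usage sketch for the Pandas loader above; the LogToDataFrame class name and module path are assumptions based on the ZAT project layout:

# Hypothetical usage; class name and module path are assumptions.
from zat.log_to_dataframe import LogToDataFrame

log_to_df = LogToDataFrame()
df = log_to_df.create_dataframe('dns.log', usecols=['query', 'qtype_name'])
print(df.head())  # 'ts' becomes the datetime index; other columns typed per the Zeek header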
Example #5
    def create_dataframe(self, log_filename, fillna=True):
        """ Create a Spark dataframe from a Bro/Zeek log file
            Args:
               log_filename (string): The full path to the Zeek log
               fillna (bool): Fill in NA/NaN values (default=True)
        """

        # Create a Zeek log reader just to read in the header for names and types
        _bro_reader = bro_log_reader.BroLogReader(log_filename)
        _, field_names, field_types, _ = _bro_reader._parse_bro_header(log_filename)

        # Get the appropriate types for the Spark Dataframe
        spark_schema = self.build_spark_schema(field_names, field_types)

        # Now actually read the Zeek Log using Spark read CSV
        _df = self.spark.read.csv(log_filename, schema=spark_schema, sep='\t', comment="#", nullValue='-')

        ''' Secondary processing (cleanup)
            - Fix column names with '.' in them
            - Fill in Nulls (optional)
            - timestamp convert
            - boolean convert
        '''

        # Fix column names
        ''' Note: Yes column names with '.' in them can be escaped with backticks when selecting them BUT
                  many pipeline operations will FAIL internally if the column names have a '.' in them.
        '''
        fixed_columns = list(map(lambda x: x.replace('.', '_'), _df.columns))
        _df = _df.toDF(*fixed_columns)

        # Fill in NULL values
        if fillna:
            _df = _df.na.fill(0)   # For numeric columns
            _df = _df.na.fill('-') # For string columns

        # Convert timestamp and boolean columns
        for name, f_type in zip(field_names, field_types):
            # Some field names may have '.' in them; the columns were renamed above,
            # so reference (and replace) them by the underscored name
            ref_name = name.replace('.', '_')
            if f_type == 'time':
                _df = _df.withColumn(ref_name, _df[ref_name].cast('timestamp'))
            if f_type == 'bool':
                _df = _df.withColumn(ref_name, when(col(ref_name) == 'T', 'true').when(col(ref_name) == 'F', 'false')
                                     .otherwise('null').cast('boolean'))

        # Return the spark dataframe
        return _df
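A usage sketch for the Spark loader. The constructor is not shown, so the assumption here is that the enclosing class is handed an existing SparkSession; the module path is likewise an assumption:

# Hypothetical usage; constructor signature and module path are assumptions.
from pyspark.sql import SparkSession
from zat.log_to_sparkdf import LogToSparkDF

spark = SparkSession.builder.appName('zeek_logs').getOrCreate()
log_to_spark = LogToSparkDF(spark)
spark_df = log_to_spark.create_dataframe('conn.log')
spark_df.printSchema()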
Example #6
    def __init__(self, filepath, eps=10, max_rows=None):
        """Initialization for the LiveSimulator Class
           Args:
               eps (int): Events Per Second that the simulator will emit events (default = 10)
               max_rows (int): The maximum number of rows to generate (default = None (go forever))
        """

        # Compute EPS timer
        # Logic:
        #     - Normal distribution centered around 1.0/eps
        #     - Make sure never less than 0
        #     - Precompute 1000 deltas and then just cycle around
        self.eps_timer = itertools.cycle([
            max(0, delta) for delta in np.random.normal(
                1.0 / float(eps), .5 / float(eps), size=1000)
        ])

        # Initialize the Zeek log reader
        self.log_reader = bro_log_reader.BroLogReader(filepath, tail=False)

        # Store max_rows
        self.max_rows = max_rows
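Only __init__ is shown above, so the emit method used in this sketch is an assumption; a rows()-style generator would pair naturally with the precomputed eps_timer:

# Hypothetical usage; the rows() generator is an assumption (only __init__ is shown).
from zat import live_simulator

sim = live_simulator.LiveSimulator('conn.log', eps=5, max_rows=100)
for row in sim.rows():
    print(row)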
Example #7
    def _get_field_info(self, log_filename):
        """Internal Method: Use the ZAT log reader to read the header for names and types"""
        _bro_reader = bro_log_reader.BroLogReader(log_filename)
        _, field_names, field_types, _ = _bro_reader._parse_bro_header(log_filename)
        return field_names, field_types
Example #8
        print('This example only works with Zeek x509.log files...')
        sys.exit(1)

    # File may have a tilde in it
    if args.bro_log:
        args.bro_log = os.path.expanduser(args.bro_log)

        # Create a VirusTotal Query Class
        vtq = vt_query.VTQuery()

        # These domains may be spoofed with a certificate issued by 'Let's Encrypt'
        spoofed_domains = set(
            ['paypal', 'gmail', 'google', 'apple', 'ebay', 'amazon'])

        # Run the bro reader on the x509.log file looking for spoofed domains
        reader = bro_log_reader.BroLogReader(args.bro_log, tail=True)
        for row in reader.readrows():

            # Pull out the Certificate Issuer
            issuer = row['certificate.issuer']
            if "Let's Encrypt" in issuer:

                # Check if the certificate subject has any spoofed domains
                subject = row['certificate.subject']
                domains = subject[3:]  # Just chopping off the 'CN=' part
                matches = [domain for domain in spoofed_domains if domain in domains]
                if matches:
                    print('\n<<< Suspicious Certificate Found >>>')
                    pprint(row)

                    # Make a Virus Total query with the spoofed domain (just for fun)
                    results = vtq.query_url(matches[0])
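The spoofed-domain check boils down to a substring test against the certificate subject; a minimal, self-contained sketch of that logic:

# Minimal sketch of the spoofed-domain check, runnable without a live x509.log.
def find_spoofed(subject, spoofed_domains):
    """Return the spoofed brand names that appear in a certificate subject."""
    domains = subject[3:]  # chop off the 'CN=' prefix
    return [domain for domain in spoofed_domains if domain in domains]

print(find_spoofed('CN=paypal.example.tk', {'paypal', 'gmail', 'apple'}))  # ['paypal']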
Example #9
if __name__ == '__main__':
    # Example to run the bro log reader on a given file

    # Collect args from the command line
    parser = argparse.ArgumentParser()
    parser.add_argument('bro_log',
                        type=str,
                        help='Specify a bro log to run BroLogReader test on')
    parser.add_argument('-t',
                        '--tail',
                        action='store_true',
                        help='Turn on log tailing')
    args, commands = parser.parse_known_args()

    # Check for unknown args
    if commands:
        print('Unrecognized args: %s' % commands)
        sys.exit(1)

    # File may have a tilde in it
    if args.bro_log:
        args.bro_log = os.path.expanduser(args.bro_log)

        # Run the bro reader on a given log file
        reader = bro_log_reader.BroLogReader(args.bro_log,
                                             tail=args.tail,
                                             strict=True)
        for row in reader.readrows():
            pprint(row)
Example #10
        except IOError:
            vtq = vt_query.VTQuery(max_cache_time=60 * 24 * 7)  # One week cache

        # See our 'Risky Domains' Notebook for the analysis and
        # statistical methods used to compute this risky set of TLDs
        risky_tlds = set([
            'info', 'tk', 'xyz', 'online', 'club', 'ru', 'website', 'in', 'ws',
            'top', 'site', 'work', 'biz', 'name', 'tech', 'loan', 'win', 'pro'
        ])

        # Launch long lived process with signal catcher
        with signal_utils.signal_catcher(save_vtq):

            # Run the bro reader on the dns.log file looking for risky TLDs
            reader = bro_log_reader.BroLogReader(args.bro_log)
            for row in reader.readrows():

                # Pull out the TLD
                query = row['query']
                tld = tldextract.extract(query).suffix

                # Check if the TLD is in the risky group
                if tld in risky_tlds:
                    # Make the query with the full query
                    results = vtq.query_url(query)
                    if results.get('positives', 0) > 3:  # At least four hits
                        print('\nRisky Domain DNS Query Found')
                        print('From: {:s} To: {:s} QType: {:s} RCode: {:s}'.
                              format(row['id.orig_h'], row['id.resp_h'],
                                     row['qtype_name'], row['rcode_name']))
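The TLD extraction step can be exercised on its own; a minimal sketch without the VirusTotal dependency:

# Minimal sketch of the risky-TLD check, no VirusTotal query required.
import tldextract

risky_tlds = {'info', 'tk', 'xyz', 'top', 'loan', 'win'}
query = 'suspicious-site.top'
tld = tldextract.extract(query).suffix
print('{:s} -> {:s} ({:s})'.format(query, tld, 'risky' if tld in risky_tlds else 'ok'))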