@staticmethod
def __extract_mac_set(data: pd.DataFrame) -> set:
     # Extract MAC addresses from data and form a set
     src_macs = set(pd.unique(data.src_mac))
     dst_macs = set(pd.unique(data.dst_mac))
     macs = src_macs.union(dst_macs)
     logger.debug('Extracted mac set.')
     return macs
 def __extract_frequency_p2p(self) -> dict:
     # Extract the frequency of each content for each (src -> dst) pair
     frequency_p2p = {mac: {} for mac in self.__mac_set}
     for src, data_flow in self.__session_p2p.items():
         for dst, session in data_flow.items():
             # Extract distribution:
             # mean & variance of frequency for each second
             delta = pd.Timedelta(seconds=1)
             frequency_p2p[src][dst] = {}
             for content in self.__content_p2p[src][dst]:
                 freq_list = []
                 all_time = session.loc[session.content == content].time
                 start_time = session.iloc[0].time
                 last_time = all_time.iloc[-1]
                 last_count = 0
                 while last_time >= start_time:
                     last_time -= delta
                     count = len(all_time.loc[all_time >= last_time])
                     freq_list.append(count - last_count)
                     last_count = count
                 # Get mean and std of frequencies
                 mean = np.mean(freq_list)
                 std = np.std(freq_list)
                 frequency_p2p[src][dst][content] = (mean, std)
     logger.debug('Extracted frequencies for mac pairs.')
     return frequency_p2p
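To make the binning above concrete, here is a minimal standalone sketch of the same per-second counting on a toy Series of timestamps (all names and values here are illustrative, not part of the class):

import numpy as np
import pandas as pd

times = pd.Series(pd.to_datetime([
    '2020-01-01 00:00:00.1', '2020-01-01 00:00:00.7',
    '2020-01-01 00:00:01.2', '2020-01-01 00:00:03.9']))
delta = pd.Timedelta(seconds=1)
last_time, start_time = times.iloc[-1], times.iloc[0]
freq_list, last_count = [], 0
while last_time >= start_time:
    last_time -= delta
    count = len(times.loc[times >= last_time])
    freq_list.append(count - last_count)  # packets in this 1-second bin
    last_count = count
print(freq_list, np.mean(freq_list), np.std(freq_list))
# [1, 0, 1, 2] 1.0 0.7071067811865476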
 def __extract_content_p2p(self) -> dict:
     # Extract all unique contents for each (src -> dst) pair
     content_p2p = {mac: {} for mac in self.__mac_set}
     for src, data_flow in self.__session_p2p.items():
         for dst, session in data_flow.items():
             content_p2p[src][dst] = set(pd.unique(session.content))
     logger.debug('Extracted content for mac pairs.')
     return content_p2p
 def update(self, packet: namedtuple) -> None:
     # Update history data by appending one entry
     src, dst = packet.src_mac, packet.dst_mac
     content = packet.content
     # Register new content in the encoding if necessary
     if content not in self.__content_encode:
         current_idx = len(self.__content_encode)
         self.__content_encode[content] = current_idx
     # Initialize slots for new mac
     for mac in (src, dst):
         if mac not in self.__mac_set:
             self.__mac_set.add(mac)
             self.__session_p2p[mac] = {}
             self.__content_p2p[mac] = {}
             self.__ngram_p2p[mac] = {}
     # Initialize mutual slots for new connection if both sides are empty
     if dst not in self.__session_p2p[src] \
             and src not in self.__session_p2p[dst]:
          self.__session_p2p[src][dst] = pd.DataFrame(
              columns=packet._fields[1:])  # matches list(packet)[1:] below
          self.__content_p2p[src][dst] = set()
          self.__ngram_p2p[src][dst] = NGram(self.__content_encode,
                                             pd.Series([], dtype=object))
     # Now at least one side is non-empty; mirror it to the other side
     for a, b in [(src, dst), (dst, src)]:
         if a not in self.__session_p2p[b]:
             self.__session_p2p[b][a] = self.__session_p2p[a][b]
             self.__content_p2p[b][a] = self.__content_p2p[a][b]
             self.__ngram_p2p[b][a] = self.__ngram_p2p[a][b]
     # Add current packet information
     # (list(packet)[1:] skips the leading index field)
     row = pd.Series(
         dict(zip(self.__session_p2p[src][dst].columns,
                  list(packet)[1:])))
     # DataFrame.append returns a copy, so rebind both directions
     ext = self.__session_p2p[src][dst].append(row, ignore_index=True)
     self.__session_p2p[src][dst] = ext
     self.__session_p2p[dst][src] = ext
     self.__content_p2p[src][dst].add(content)
     # Update the n-gram model with the (previous -> current) transition
     if len(ext) >= 2:
         previous = ext.iloc[-2].content
         self.__ngram_p2p[src][dst].update(previous, content)
     # Update gauss model
     self.__update_count += 1
     if self.__update_count >= self.__update_gauss_limit:
         self.__update_count = 0
         self.__frequency_p2p = self.__extract_frequency_p2p()
         logger.debug('Gaussian parameters updated.')
     # Remove the oldest entry if the history count exceeds the limit
     self.__count += 1
     if self.__count > self.__limit:
         self.__count = self.__limit
         # Remove the oldest transition from the n-gram model
         old_content_1 = self.__session_p2p[src][dst].iloc[0].content
         old_content_2 = self.__session_p2p[src][dst].iloc[1].content
         self.__ngram_p2p[src][dst].update(old_content_1,
                                           old_content_2,
                                           is_remove=True)
         # Drop the oldest entry from the session
         # (the current packet was already appended above)
         idx = self.__session_p2p[src][dst].head(1).index
         self.__session_p2p[src][dst].drop(idx, inplace=True)
     logger.debug('Updated one packet.')
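One subtlety in update() worth making explicit: both directions of a conversation share a single DataFrame, and since DataFrame.append returns a copy (pandas < 2.0), the result must be rebound under both keys. A small sketch with a hypothetical two-node dict, not the actual class state:

import pandas as pd

session_p2p = {'aa': {}, 'bb': {}}
conv = pd.DataFrame(columns=['time', 'content'])
session_p2p['aa']['bb'] = conv
session_p2p['bb']['aa'] = conv  # same object under both keys
row = pd.Series({'time': 0, 'content': 'read'})
ext = session_p2p['aa']['bb'].append(row, ignore_index=True)
session_p2p['aa']['bb'] = ext  # append copied the frame,
session_p2p['bb']['aa'] = ext  # so rebind both directions
assert session_p2p['aa']['bb'] is session_p2p['bb']['aa']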
 def __extract_session_p2p(self, data: pd.DataFrame) -> dict:
     # Group packets by each (src -> dst) pair
     session_dict = {mac: {} for mac in self.__mac_set}
     session_list = data.groupby(['src_mac', 'dst_mac'])
     for tag, session in session_list:
         src, dst = tag
         session_dict[src][dst] = session
     logger.debug('Extracted sessions for mac pairs.')
     return session_dict
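For reference, grouping on the two MAC columns yields ((src, dst), frame) pairs, which is exactly what the nested dict is built from; a toy run (column values are illustrative only):

import pandas as pd

data = pd.DataFrame({'src_mac': ['aa', 'aa', 'bb'],
                     'dst_mac': ['bb', 'cc', 'aa'],
                     'content': ['x', 'y', 'z']})
for (src, dst), session in data.groupby(['src_mac', 'dst_mac']):
    print(src, dst, len(session))
# aa bb 1
# aa cc 1
# bb aa 1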
 def drop(self) -> None:
     # Re-initialize information slots
     self.__count = 0
     self.__mac_set = None
     self.__content_encode = None
     self.__content_p2p = None
     self.__session_p2p = None
     self.__ngram_p2p = None
     self.__frequency_p2p = None
     logger.debug('Dropped history data.')
 def __extract_ngram_p2p(self) -> dict:
     # Extract and build n-gram models on each mutual conversation
     ngram_dict = {mac: {} for mac in self.__mac_set}
     for src, data_flow in self.__session_p2p.items():
         for dst, session in data_flow.items():
             if dst not in ngram_dict[src]:
                 ngram = NGram(self.__content_encode, session.content)
                 # Assign symmetric sides with same reference
                 ngram_dict[src][dst] = ngram
                 ngram_dict[dst][src] = ngram
     logger.debug('Extracted bi-gram tables.')
     return ngram_dict
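NGram is an external class not shown here. Judging from how it is used in this file, it takes the content-encoding dict plus an initial content Series and maintains a bigram count table via update(previous, current, is_remove=False). A minimal sketch of that presumed interface (an assumption, not the real implementation):

import numpy as np
import pandas as pd

class NGramSketch:
    def __init__(self, encode: dict, contents: pd.Series):
        # Fixed-size table for the sketch; the real class
        # presumably grows along with the encoding dict
        size = len(encode)
        self.encode = encode
        self.table = np.zeros((size, size), dtype=int)
        values = list(contents)
        for prev, cur in zip(values, values[1:]):
            self.update(prev, cur)

    def update(self, previous, current, is_remove: bool = False):
        i, j = self.encode[previous], self.encode[current]
        self.table[i, j] += -1 if is_remove else 1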
 def load(self, data: pd.DataFrame, clear: bool = False):
     # Load history data and extract patterns
     # Clear old history before loading current history
     if clear:
         self.drop()
     # Truncate data if its size exceeds the history limit
     size = len(data)
     if size > self.__limit:
         logger.warning('Loading last %d packets into history.'
                        % self.__limit)
         data = data.iloc[-self.__limit:]
     self.__count = min(size, self.__limit)
     # Extract patterns from data
     self.__mac_set = self.__extract_mac_set(data)
     self.__content_encode = self.__extract_content_encode(data)
     self.__session_p2p = self.__extract_session_p2p(data)
     self.__content_p2p = self.__extract_content_p2p()
     self.__frequency_p2p = self.__extract_frequency_p2p()
     self.__merge_session_p2p()
     self.__ngram_p2p = self.__extract_ngram_p2p()
     logger.debug('Data loaded.')
 def __merge_session_p2p(self) -> None:
     # Merge sessions for each (src <-> dst) pair into conversations
     merged_set = set()
     session_dict = self.__session_p2p
     for src, data_flow in session_dict.items():
         for dst, session in data_flow.items():
             # Not merged yet
             if (src, dst) not in merged_set:
                 mutual = session
                 if src in session_dict[dst]:
                     mutual = mutual.append(session_dict[dst][src])
                     # Sort by time and re-index
                     mutual.sort_values('time',
                                        ascending=True,
                                        inplace=True)
                     mutual.reset_index(drop=True, inplace=True)
                 # Assign symmetric sides with same reference;
                 # This might be dangerous, as we modify
                 # the dict while iterating on it
                 session_dict[src][dst] = mutual
                 session_dict[dst][src] = mutual
                 # Set dst, src to be merged
                 merged_set.add((dst, src))
     logger.debug('Merged sessions for mac pairs.')
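A standalone sketch of the merge step: the two directional frames are concatenated, time-sorted, and re-indexed into one conversation (toy data; DataFrame.append assumes pandas < 2.0, as elsewhere in this file):

import pandas as pd

ab = pd.DataFrame({'time': pd.to_datetime(['2020-01-01 00:00:00',
                                           '2020-01-01 00:00:02']),
                   'content': ['req', 'req']})
ba = pd.DataFrame({'time': pd.to_datetime(['2020-01-01 00:00:01']),
                   'content': ['resp']})
mutual = ab.append(ba)
mutual.sort_values('time', ascending=True, inplace=True)
mutual.reset_index(drop=True, inplace=True)
print(mutual.content.tolist())  # ['req', 'resp', 'req']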
Example #10
 @staticmethod
 def __extract_content_encode(data: pd.DataFrame) -> dict:
     # Extract all unique contents and form an encoding dict
     content_list = pd.unique(data.content)
     content_dict = {con: i for i, con in enumerate(content_list)}
     logger.debug('Extracted all content encode.')
     return content_dict
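The resulting dict simply numbers contents in first-seen order, which is also why a newly seen content in update() gets index len(self.__content_encode); for example:

import pandas as pd

data = pd.DataFrame({'content': ['read', 'write', 'read']})
content_list = pd.unique(data.content)
print({con: i for i, con in enumerate(content_list)})
# {'read': 0, 'write': 1}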
Example #11
import pandas as pd
from _log import logger
from monitor import Monitor, DataManager

data = pd.read_csv('../data/CIP_1.csv', sep='\t')
data.time = pd.to_datetime(data.time, format='%Y-%m-%d %H:%M:%S.%f')
train_size = 20000
test_size = 20000
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size: train_size + test_size]

# Load history data
manager = DataManager(limit=train_size)
manager.load(train_data, clear=True)

# Build monitor using manager
monitor = Monitor(manager)
count = [0, 0]
for row in test_data.itertuples():
    is_safe = monitor.process(row)
    count[is_safe] += 1
logger.debug(str(count) + ' trust rate: ' + str(count[1] / sum(count)))