def transform(self, features: Features) -> Features:
     """Build a ``Features`` object holding only the selected columns.

     A column is kept when its name is contained in ``self.features``;
     all other columns are dropped from the pandas frame and their type
     entries are left out of the resulting type list.
     """
     # One flag per column: True means "keep", False means "drop".
     keep_mask = [
         column_name in self.features for column_name in features.names
     ]
     dropped_positions = [
         position for position, keep in enumerate(keep_mask) if not keep
     ]

     frame = features.as_pandas()
     # ``drop`` works on labels, so positions are mapped to labels first.
     frame.drop(columns=frame.columns[dropped_positions], inplace=True)

     kept_types = [
         feature_type
         for feature_type, keep in zip(features.types, keep_mask)
         if keep
     ]
     return Features.from_pandas(df=frame, types=kept_types)
 def fit_transform(self, features: Features) -> Features:
     """Record the distinct values of each categorical column, then encode.

     For every column typed ``FeatureType.CATEGORIAL`` the set of values
     found in the frame is stored in ``self.feature_values`` under the
     column name. The frame is then handed to ``_transform_df``.
     """
     frame = features.as_pandas(copy=False)

     categorical_names = []
     for position, feature_type in enumerate(features.types):
         if feature_type == FeatureType.CATEGORIAL:
             categorical_names.append(features.names[position])

     for column in categorical_names:
         distinct = set(frame[column].tolist())
         logging.debug("Got values %s for feature %s: ", distinct, column)
         self.feature_values[column] = list(distinct)

     return self._transform_df(frame, features)
Exemple #3
0
 def _extract_additional_features(self, flows: t.List[NetFlow]) -> Features:
     """Build one feature row per flow via ``_make_flow_features``.

     The rows are stacked into a numpy array and labelled with four
     column names, each typed ``FeatureType.FLOAT``.
     """
     column_names = [
         "mean_payl_dist",
         "min_payl_dist",
         "max_payl_dist",
         "std_payl_dist",
     ]
     rows = list(map(self._make_flow_features, flows))
     return Features(
         data=np.array(rows),
         names=column_names,
         types=[FeatureType.FLOAT] * len(column_names),
     )
 def _transform_df(self, df: pandas.DataFrame,
                   original_features: Features) -> Features:
     """Replace each known categorical column by 0/1 indicator columns.

     For every column listed in ``self.feature_values`` one column named
     ``<column>_ohe_<value>`` is appended per stored value, after which
     the source column is dropped. ``df`` is modified in place.

     The resulting type list consists of the original types of all
     columns not listed in ``self.feature_values`` followed by one
     ``FeatureType.BINARY`` entry per appended indicator column.
     """
     indicator_types = []
     for source_column, known_values in self.feature_values.items():
         for known_value in known_values:
             indicator_name = source_column + "_ohe_" + str(known_value)
             df[indicator_name] = df[source_column].apply(
                 lambda cell: 1 if cell == known_value else 0)
             indicator_types.append(FeatureType.BINARY)
         df.drop(source_column, axis=1, inplace=True)

     retained_types = []
     for position, feature_name in enumerate(original_features.names):
         if feature_name in self.feature_values:
             # Encoded columns are represented by the indicator types.
             continue
         retained_types.append(original_features.types[position])

     return Features.from_pandas(df, retained_types + indicator_types)
Exemple #5
0
    def _extract_additional_features(self, flows: t.List[NetFlow]) -> Features:
        """Compute the relative frequency of every byte value per flow.

        For each flow, the bytes of ``ip.data`` of all its packets are
        counted per byte value (0-255) and divided by the total number of
        bytes seen, giving 256 columns ``freq_byte_0`` .. ``freq_byte_255``
        typed ``FeatureType.FLOAT``.

        A flow that carries no bytes at all (no packets, or only empty
        payloads) gets an all-zero row instead of raising
        ``ZeroDivisionError``.
        """
        payload_features = []
        for flow in flows:
            counts = [0] * 256
            total_bytes = 0
            for _, ip in flow.packets:
                payload = bytes(ip.data)
                total_bytes += len(payload)
                for byte in payload:
                    counts[byte] += 1
            if total_bytes:
                distribution = [abs_freq / total_bytes for abs_freq in counts]
            else:
                # No payload bytes: avoid dividing by zero.
                distribution = [0.0] * 256
            payload_features.append(distribution)

        return Features(
            data=np.array(payload_features, ndmin=2),
            names=["freq_byte_%s" % i for i in range(256)],
            types=[FeatureType.FLOAT for _ in range(256)],
        )
 def _extract_flow_features(self, flows: t.List[NetFlow]) -> Features:
     """Build one feature row per flow according to ``self.modes``.

     With ``FeatureSetMode.BASIC`` a row is just source port, destination
     port and protocol. Otherwise a row starts with the port features,
     the protocol and the packet-list features of all, forward and
     backward packets, and is extended by the optional groups enabled in
     ``self.modes`` (``WITH_IP_ADDR``, ``SUBFLOWS``, ``TCP``,
     ``HINDSIGHT``), in that order.

     Column names and types come from ``_make_flow_names_types``.
     """
     rows = []
     names, types = self._make_flow_names_types()
     progress = tqdm(
         flows,
         desc="Extract statistical flow features",
         disable=(not self.verbose),
     )
     for index, flow in enumerate(progress):
         if FeatureSetMode.BASIC in self.modes:
             rows.append([flow.src_port, flow.dest_port, flow.protocol])
             continue

         fwd_packets = flow.get_packets_in_direction(FlowDirection.FORWARDS)
         bwd_packets = flow.get_packets_in_direction(
             FlowDirection.BACKWARDS)

         all_stats = self._extract_packet_list_features(flow.packets)
         fwd_stats = self._extract_packet_list_features(fwd_packets)
         bwd_stats = self._extract_packet_list_features(bwd_packets)

         row = (self._get_port_features(flow.src_port) +
                self._get_port_features(flow.dest_port) +
                [flow.protocol] + all_stats + fwd_stats + bwd_stats)

         if FeatureSetMode.WITH_IP_ADDR in self.modes:
             src_addr = self._get_ip_addr_features(flow.src_ip)
             dest_addr = self._get_ip_addr_features(flow.dest_ip)
             row += src_addr + dest_addr

         if FeatureSetMode.SUBFLOWS in self.modes:
             active_idle = self._extract_active_idle_features(flow.packets)
             fwd_subflows = self._extract_subflow_features(fwd_packets)
             bwd_subflows = self._extract_subflow_features(bwd_packets)
             row += active_idle + fwd_subflows + bwd_subflows

         if FeatureSetMode.TCP in self.modes:
             row += self._make_tcp_features(flow, fwd_packets, bwd_packets)

         if FeatureSetMode.HINDSIGHT in self.modes:
             # Only the flows directly preceding the current one, at most
             # ``hindsight_window`` of them, are passed along.
             first_index = int(max(0, index - self.hindsight_window))
             preceding = flows[first_index:index]
             row += self._make_hindsight_features(flow, preceding)

         rows.append(row)

     return Features(data=np.array(rows), names=names, types=types)
 def transform(self, features: Features) -> Features:
     """Hand the pandas frame of ``features`` to ``_transform_df``.

     The frame is requested with ``copy=False``.
     """
     return self._transform_df(features.as_pandas(copy=False), features)
 def transform(self, features: Features) -> Features:
     """Run ``self._scaler.transform`` over the data of ``features``.

     Names and types of the result are taken from
     ``transform_feature_type_names``; the new ``Features`` object is
     validated before it is returned.
     """
     scaled = self._scaler.transform(features.data)
     out_names, out_types = self.transform_feature_type_names(features)
     result = Features(data=scaled, names=out_names, types=out_types)
     result.validate()
     return result