def sort_dictionary(dictionary, order_by='key', order='desc'):
    # TODO: check keyword parameters
    method_name = "sort_dictionary()"

    logging.info("Sorting dict ...")

    # Check the data type as dict
    if not isinstance(dictionary, dict):
        raise MSNMError(None, "Invalid dict as param", method_name)

    try:
        # Which order?
        reverse_order = (order == 'desc')

        if order_by == 'key':
            d = OrderedDict(sorted(dictionary.items(), key=lambda t: t[0], reverse=reverse_order))
        else:
            d = OrderedDict(sorted(dictionary.items(), key=lambda t: t[1], reverse=reverse_order))
    except Exception:
        raise MSNMError(None, sys.exc_info()[0], method_name)

    logging.info("Ending sorting dict ...")

    return d
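# Usage sketch (illustrative only; the sample data below is hypothetical):
# sorting a dict of per-source anomaly scores by value, descending. Any
# order_by other than 'key' sorts by value.
#
# >>> scores = {'src_a': 0.8, 'src_b': 0.3, 'src_c': 0.9}
# >>> sort_dictionary(scores, order_by='value', order='desc')
# OrderedDict([('src_c', 0.9), ('src_a', 0.8), ('src_b', 0.3)])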
def sort_vector(vector, order, axis, abs_value):
    method_name = "sort_vector()"

    # Check the data type as ndarray
    if not isinstance(vector, np.ndarray):
        raise MSNMError(None, "Data is not an ndarray", method_name)

    try:
        # Absolute value?
        if abs_value:
            aux = np.abs(vector)
        else:
            aux = vector

        # Sorting asc
        aux = np.sort(aux, axis=axis)

        # Do sorting desc?
        if order == 'desc':
            aux = aux[::-1]
    except Exception:
        raise MSNMError(None, sys.exc_info()[0], method_name)

    return aux
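# Usage sketch (illustrative only; the vector below is hypothetical):
# descending sort of a 1-D score vector by absolute value.
#
# >>> v = np.array([-3.0, 1.0, -0.5, 2.0])
# >>> sort_vector(v, order='desc', axis=0, abs_value=True)
# array([3. , 2. , 1. , 0.5])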
def averageDataImputation(**kwargs):
    """
    All missing data in X (NxM) will be replaced by their average value
    """

    method_name = "averageDataImputation()"

    # Check optional parameters
    # Check the observation for imputation
    if 'obs' in kwargs:
        obs = kwargs['obs']
    else:
        logging.error("There is no observation to recover")
        raise MSNMError(None, "There is no observation to recover", method_name)

    # Check the calibration model
    if 'model' in kwargs:
        model = kwargs['model']
    else:
        logging.error("There is no calibration model")
        raise MSNMError(None, "There is no calibration model", method_name)

    # Doing average imputation
    logging.debug("Doing average based data imputation ...")
    # DataFrame.as_matrix() was removed in pandas 1.0; use .values instead
    rec_obs = pd.DataFrame(obs).fillna(pd.DataFrame(model.get_av())).values

    return rec_obs
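# Usage sketch (illustrative only): impute NaNs in a 1xM observation with the
# calibration-model averages. FakeModel is a hypothetical stand-in for the
# real calibration model; all the function needs is a get_av() returning the
# [1xM] average vector.
#
# >>> class FakeModel(object):
# ...     def get_av(self):
# ...         return np.array([[1.0, 2.0, 3.0]])
# >>> obs = np.array([[np.nan, 5.0, np.nan]])
# >>> averageDataImputation(obs=obs, model=FakeModel())
# array([[1., 5., 3.]])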
def run(self):
    method_name = "run()"

    logging.info("Running client thread. Thread: %s", threading.current_thread().getName())

    try:
        # Send packet to the server
        client_sock = self._client_instance.send_msg_to_server(self._client_instance._packet)
        logging.debug("Sending packet to %s", self._client_instance._server_address)
        # Get response from server
        response = self._client_instance.recv_msg_from_server(client_sock)
        logging.debug("Server %s sent the response %s",
                      self._client_instance._server_address,
                      response._body['resp'])
    except CommError as ce:
        logging.error(ce.get_msg())
        # TODO: add a mechanism to propagate exceptions raised in child threads to the main thread
        raise MSNMError(self, ce.get_msg(), method_name)
def preprocess2Dapp(test, average, scale):
    """
    Apply autoscaled preprocessing to ``test`` data

    Parameters
    ----------
    test: numpy.ndarray
        [NxM] bilinear data set
    average: numpy.ndarray
        [1xM] sample average to subtract
    scale: numpy.ndarray
        [1xM] sample scale to divide the test

    Return
    ------
    testAutoScaled: numpy.ndarray
        [NxM] preprocessed data.

    Raises
    ------
    MSNMError
        General error raised when something goes wrong

    Example
    -------
    >>> from msnm.utils import datautils as tools
    >>> import numpy as np
    >>> import scipy.io as sio

    >>> # Original data set X
    >>> originalData = './datatest/data_adicov_mspc.mat'

    >>> # Returns a dictionary like {'variable_name':'variable_data'}
    >>> x = sio.loadmat(originalData)
    >>> data = x['X']
    >>> weights = np.ones((data.shape[0],1))
    >>> xcs, average, scale = tools.preprocess2D(data,2,weights)

    >>> # anomalous data test
    >>> test = x['test']

    >>> # data test autoscaled
    >>> testcs = tools.preprocess2Dapp(test,average,scale)
    """

    method_name = "preprocess2Dapp()"

    try:
        # mean centering
        testMeanCentered = test - np.dot(np.ones((test.shape[0], 1)), average)

        # auto-scaling
        testAutoScaled = testMeanCentered / np.dot(np.ones((test.shape[0], 1)), scale)
    except Exception:
        raise MSNMError(None, sys.exc_info()[0], method_name)

    return testAutoScaled
def save2json(json_contents, path_to_save):

    logging.info("Saving json file in %s", path_to_save)

    try:
        with open(path_to_save, 'w') as f:
            # Save raw data (json_contents is an already-serialized string)
            f.write(json_contents)
    except IOError as ioe:
        logging.error("Error saving json file: %s", sys.exc_info()[1])
        raise MSNMError(ioe, sys.exc_info()[1], 'save2json()')
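# Usage sketch (illustrative only; the output path is hypothetical). Note
# that the function expects already-serialized JSON, not a Python object:
#
# >>> import json
# >>> save2json(json.dumps({'Q': 0.91, 'D': 1.27}), '/tmp/output_example.json')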
def sort_dict(keys, values, order_by, order, abs_value):
    # TODO: check keyword parameters
    method_name = "sort_dict()"

    # Check the data type as list
    if not isinstance(keys, list):
        raise MSNMError(None, "Invalid list of keys", method_name)

    # Check the data type as ndarray
    if not isinstance(values, np.ndarray):
        raise MSNMError(None, "Invalid values array", method_name)

    try:
        # Absolute value?
        if abs_value:
            aux = np.abs(values)
        else:
            aux = values

        # Which order?
        reverse_order = (order == 'desc')

        # Make a dict from {keys:values}
        d = dict(zip(keys, aux))

        if order_by == 'key':
            d = OrderedDict(sorted(d.items(), key=lambda t: t[0], reverse=reverse_order))
        else:
            d = OrderedDict(sorted(d.items(), key=lambda t: t[1], reverse=reverse_order))
    except Exception:
        raise MSNMError(None, sys.exc_info()[0], method_name)

    return d
def zeroDataImputation(**kwargs):

    method_name = "zeroDataImputation()"

    # Check optional parameters
    if 'obs' in kwargs:
        obs = kwargs['obs']
    else:
        logging.error("There is no observation to recover")
        raise MSNMError(None, "There is no observation to recover", method_name)

    # Zero value imputation
    logging.debug("Doing zero based data imputation ...")
    # DataFrame.as_matrix() was removed in pandas 1.0; use .values instead
    rec_obs = pd.DataFrame(obs).fillna(0).values

    return rec_obs
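# Usage sketch (illustrative only): NaNs are simply replaced with zeros.
#
# >>> obs = np.array([[np.nan, 5.0, np.nan]])
# >>> zeroDataImputation(obs=obs)
# array([[0., 5., 0.]])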
def preprocess2D(x, prep, weights):
    """
    Data preprocessing depending on the ``prep`` parameter

    Parameters
    ----------
    x: numpy.ndarray
        [NxM] bilinear data set
    prep: int
        Choose the preprocessing method:
           0: no preprocessing
           1: mean-centering
           2: auto-scaling (default)
    weights: numpy.ndarray
        [1xM] weight applied after preprocessing. Set to a vector of 1s by default.

    Return
    ------
    xcs: numpy.ndarray
        [NxM] preprocessed data.
    average: numpy.ndarray
        [1xM] sample average according to the preprocessing method.
    scale: numpy.ndarray
        [1xM] sample scale according to the preprocessing method.

    .. todo::
        weights vector is not implemented

    Raises
    ------
    MSNMError
        General error raised when something goes wrong

    Example
    -------
    >>> from msnm.utils import datautils as tools
    >>> import numpy as np
    >>> import scipy.io as sio

    >>> # Original data set X
    >>> originalData = './datatest/data_adicov_mspc.mat'

    >>> # Returns a dictionary like {'variable_name':'variable_data'}
    >>> x = sio.loadmat(originalData)
    >>> data = x['X']
    >>> weights = np.ones((data.shape[0],1))
    >>> xcs, average, scale = tools.preprocess2D(data,2,weights)
    """

    method_name = "preprocess_2D()"

    try:
        if prep == 1:
            # mean avoiding NaN for each variable
            average = np.nanmean(x, axis=0)  # array of M elements
            average = average.reshape((1, average.shape[0]))  # matrix of 1xM elements
            # array 1xM, being M the number of variables
            scale = np.ones((1, x.shape[1]))
            # subtract the average from the data set x
            xcs = x - np.dot(np.ones((x.shape[0], 1)), average)
            # TODO: do test with NaN in the data set
        elif prep == 2:
            # NaNs and numbers in X
            nanM = np.isnan(x)
            anM = 1 - nanM
            average = np.nanmean(x, axis=0)  # array of M elements
            average = average.reshape((1, average.shape[0]))  # matrix of 1xM elements
            scale = np.nanstd(x, axis=0, ddof=1)
            # Variables with zero scale get a replacement scale derived from
            # the number of non-NaN entries, so the division below is well defined
            ind = np.nonzero(scale == 0)  # indices of zeros in scale
            dem = 2.0 * np.sum(anM[:, ind], axis=0) - 1
            scale[ind] = np.sqrt(np.ones((1, np.array(ind).size)) / dem)
            scale = scale.reshape((1, scale.shape[0]))  # matrix of 1xM elements
            xcs = x - np.dot(np.ones((x.shape[0], 1)), average)
            xcs = xcs / np.dot(np.ones((x.shape[0], 1)), scale)
            # TODO: do test with NaN in the data set
        else:
            logging.warning("Preprocessing method %s is not available ...", prep)
    except Exception:
        raise MSNMError(None, sys.exc_info()[0], method_name)

    return xcs, average, scale
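# Numeric sanity check (illustrative only; the data below is hypothetical):
# auto-scaling (prep=2) centers each column and divides by its sample
# standard deviation (ddof=1).
#
# >>> x = np.array([[1.0, 10.0], [3.0, 30.0]])
# >>> xcs, average, scale = preprocess2D(x, 2, np.ones((2, 1)))
# >>> average
# array([[ 2., 20.]])
# >>> scale
# array([[ 1.41421356, 14.14213562]])
# >>> xcs
# array([[-0.70710678, -0.70710678],
#        [ 0.70710678,  0.70710678]])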
def preprocess2Di(x, prep, lamda, average, scale, N, weights):
    """
    Data preprocessing applying the EWMA methodology.

    J. Camacho, "Visualizing Big data with Compressed Score Plots: Approach
    and research challenges," Chemometrics and Intelligent Laboratory Systems,
    vol. 135, pp. 110-125, Jul. 2014.

    References
    ----------
    Visualizing Big data with Compressed Score Plots: Approach and research challenges
    http://www.sciencedirect.com/science/article/pii/S016974391400080X

    Parameters
    ----------
    x: numpy.ndarray
        [NxM] bilinear data set
    prep: int
        Choose the preprocessing method:
           0: no preprocessing
           1: mean-centering
           2: auto-scaling (default)
    lamda: float
        forgetting factor [0,1]
    average: numpy.ndarray or scalar
        [1xM] (t-1) previously computed average array
    scale: numpy.ndarray or scalar
        [1xM] (t-1) previously computed scale array
    N: int
        number of observations used to compute mean and scale vectors
    weights: numpy.ndarray
        [1xM] weight applied after preprocessing. Set to a vector of 1s by default.

    Return
    ------
    xcs: numpy.ndarray
        [NxM] preprocessed data.
    average: numpy.ndarray
        [1xM] sample average according to the preprocessing method.
    scale: numpy.ndarray
        [1xM] sample scale according to the preprocessing method.
    N: int
        Current N after applying the forgetting factor

    .. todo::
        weights vector is not implemented yet

    Raises
    ------
    MSNMError
        General error raised when something goes wrong

    Example
    -------
    >>> from msnm.utils import datautils as tools
    >>> import numpy as np
    >>> import scipy.io as sio

    >>> # Original data set X
    >>> originalData = './datatest/data_adicov_mspc.mat'

    >>> # Returns a dictionary like {'variable_name':'variable_data'}
    >>> x = sio.loadmat(originalData)
    >>> data = x['X']
    >>> weights = np.ones((data.shape[0],1))
    >>> xcs, average, scale = tools.preprocess2D(data,2,weights)
    """

    method_name = "preprocess2Di()"

    logging.info("Preprocessing data dynamically for N=%s obs and lambda=%s", N, lamda)

    # EWMA mean update model
    # M_t^x = lambda * M_(t-1)^x + X_t
    # m_t^x = (1/N_t) * M_t^x
    # N_t = lambda * N_(t-1) + B_t

    # acc <=> M_t^x --> current model accumulated sum
    # average <=> m_t^x --> current model mean
    acc = average * N

    # acc2 <=> (sigma_t^x)^2 --> current model accumulated variability
    # scale <=> sigma_t^x --> current model standard deviation
    acc2 = (scale ** 2) * np.max([N - 1, 0])

    # Current number of effective observations used to compute the mean and
    # standard deviation
    N = lamda * N + x.shape[0]

    try:
        if prep == 1:  # mean centering
            logging.debug("EWMA mean centering")

            # Computes the current model mean
            acc = lamda * acc + np.sum(x, axis=0)
            average = acc / N
            average = average.reshape(1, x.shape[1])

            # array 1xM, being M the number of variables
            scale = np.ones((1, x.shape[1]))

            # subtract the average from the data set x
            xcs = x - np.dot(np.ones((x.shape[0], 1)), average)
            # TODO: do test with NaN in the data set
        elif prep == 2:  # auto-scaling
            logging.debug("EWMA auto-scaling")

            # Computes the current model mean
            acc = lamda * acc + np.sum(x, axis=0)
            average = acc / N
            average = average.reshape(1, x.shape[1])

            # subtract the average from the data set x
            xc = x - np.dot(np.ones((x.shape[0], 1)), average)

            # Computes the current model standard deviation
            acc2 = lamda * acc2 + np.sum(xc ** 2, axis=0)
            scale = np.sqrt(acc2 / (N - 1))

            # Is scale all zeros?
            if np.nonzero(scale)[0].shape[0] == 0:
                mS = 2
            else:
                mS = np.min(scale[np.nonzero(scale)])

            # use mS/2 instead of 1 by default; 1 may reduce the detection of anomalous events
            scale[np.nonzero(scale == 0)] = mS / 2

            # apply the scale
            scale = scale.reshape(1, x.shape[1])
            xcs = xc / np.dot(np.ones((x.shape[0], 1)), scale)
            # TODO: do test with NaN in the data set
        elif prep == 3:  # scaling only
            logging.debug("EWMA scaling")

            # The model mean is not used in this method
            average = np.zeros((1, x.shape[1]))

            # Computes the current model standard deviation
            acc2 = lamda * acc2 + np.sum(x ** 2, axis=0)
            scale = np.sqrt(acc2 / (N - 1))

            # Is scale all zeros?
            if np.nonzero(scale)[0].shape[0] == 0:
                mS = 2
            else:
                mS = np.min(scale[np.nonzero(scale)])

            # use mS/2 instead of 1 by default; 1 may reduce the detection of anomalous events
            scale[np.nonzero(scale == 0)] = mS / 2

            # apply the scale
            scale = scale.reshape(1, x.shape[1])
            xcs = x / np.dot(np.ones((x.shape[0], 1)), scale)
            # TODO: do test with NaN in the data set
        else:
            logging.warning("The selected preprocessing method is not valid")
            average = np.zeros((1, x.shape[1]))
            scale = np.ones((1, x.shape[1]))
            xcs = x
    except Exception:
        logging.error("Error preprocessing the data: %s", sys.exc_info()[1])
        raise MSNMError(None, sys.exc_info()[1], method_name)

    return xcs, average, scale, N
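# Numeric sanity check (illustrative only; values are hypothetical). With a
# previous mean of 10.0 over N=4 effective observations, lambda=0.5 and a new
# batch of B=2 observations, the mean-centering branch (prep=1) updates
# m_t = (lambda * m_(t-1) * N_(t-1) + sum(x)) / (lambda * N_(t-1) + B_t)
#     = (0.5 * 10 * 4 + 26) / (0.5 * 4 + 2) = 46 / 4 = 11.5:
#
# >>> x = np.array([[12.0], [14.0]])
# >>> xcs, avg, sc, N = preprocess2Di(x, 1, 0.5, np.array([[10.0]]),
# ...                                 np.array([[1.0]]), 4, np.ones((1, 1)))
# >>> N
# 4.0
# >>> avg
# array([[11.5]])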
def launch_monitoring(self, ts):
    """
    Once the parsing (flow parser) procedure is done, this method is in
    charge of starting the monitoring process

    Raises
    ------
    MSNMError
    """

    method_name = "launch_monitoring()"

    # Configuration
    config = Configure()
    # Get root path for creating data files
    rootDataPath = config.get_config()['GeneralParams']['rootPath']

    # path to save the complete observation joining all data sources
    obs_generated_path = rootDataPath + config.get_config()['Sensor']['observation']
    # number of observations in a batch for EWMA calibration
    batch_obs = config.get_config()['Sensor']['dynamiCalibration']['B']
    # forgetting parameter for EWMA calibration
    lambda_param = config.get_config()['Sensor']['dynamiCalibration']['lambda']
    # is the dynamic calibration activated?
    dyn_cal_enabled = config.get_config()['Sensor']['dynamiCalibration']['enabled']
    # path to save the Q and T statistics obtained from the previous observation
    output_generated_path = rootDataPath + config.get_config()['Sensor']['output']
    # Missing data available methods
    missingDataMethods = config.get_config()['Sensor']['missingData']['missingDataMethods']
    # Get the selected missing data method
    missingDataSelectedMethod = config.get_config()['Sensor']['missingData']['selected']
    # Module containing the missing data methods
    missingDataModule = config.get_config()['Sensor']['missingData']['missingDataModule']
    # how the variables of the complete observation are saved
    valuesFormat = config.get_config()['GeneralParams']['valuesFormat']

    logging.debug("Launch monitoring for %s ", ts)

    try:
        logging.debug("Building the observation at %s for %s sources.", ts, self._sources.keys())

        # Build the observation for monitoring
        test = []

        for i in self._sources.keys():
            # Get the number of variables of source i
            i_variables = self.get_number_source_variables(self._sources[i], i)
            logging.debug("Source %s has %s variables.", i, i_variables)

            # Get the source output parsed file for the current timestamp
            i_parsed_file = self._sources[i]._files_generated[ts]
            logging.debug("File generated of source %s at %s: %s", i, ts, i_parsed_file)

            if i_parsed_file:
                # Load the file
                if self._sources[i]._type == Source.TYPE_L:
                    # static mode?
                    # TODO: next version
                    #staticMode = config.get_config()['DataSources'][self._sources[i]._type][i]['staticMode'];
                    staticMode = False

                    if not staticMode:  # online or dynamic mode
                        i_test = np.loadtxt(i_parsed_file, comments="#", delimiter=",")
                    else:  # offline or static mode
                        # TODO: this is just a patch to remove in_npackets_verylow and
                        # in_nbytes_verylow like in the matlab experiment, and just for Netflow!
                        # Look for a smarter way to do this, e.g., by configuration params
                        i_test = np.loadtxt(i_parsed_file, comments="#", delimiter=",",
                                            usecols=range(1, i_variables + 1 + 2))

                        logging.debug("Offline mode for source %s. Observation size of %s", i, i_test.shape)

                        mask = np.ones(i_test.shape, dtype=bool)
                        # in_npackets_verylow index in matlab is 119 --> 118 in numpy
                        # in_nbytes_verylow index in matlab is 129 --> 128 in numpy
                        mask[118] = False
                        mask[128] = False
                        i_test = i_test[mask]

                        logging.debug("Offline mode for source %s. Observation size of %s after removing useless variables.", i, i_test.shape)
                elif self._sources[i]._type == Source.TYPE_R:
                    i_test = np.loadtxt(i_parsed_file, comments="#", delimiter=",")
                else:
                    logging.warning("Source %s does not have a valid type. Type: %s", i, self._sources[i]._type)
            else:
                # Missing values are replaced with NaN values
                i_test = np.empty(i_variables)
                i_test[:] = np.nan

            # Test observation
            test = np.concatenate((test, i_test), axis=0)

        # 1xM array
        test = test.reshape((1, test.size))

        # Dynamic invocation of the selected data imputation method if needed
        if np.isnan(test).any():
            missingDataMethod = getattr(importlib.import_module(missingDataModule),
                                        missingDataMethods[missingDataSelectedMethod])
            logging.debug("Invoking %s method for data imputation for observation at %s",
                          missingDataMethod.__name__, ts)
            # Calling the corresponding method
            test = missingDataMethod(obs=test, model=self._sensor._model)

        obs_generate_file = obs_generated_path + "obs_" + ts + ".dat"
        np.savetxt(obs_generate_file, test, fmt=valuesFormat, delimiter=",",
                   header=str(datautils.getAllVarNames()), comments="#")

        logging.debug("Observation generated of %s variables at %s.", test.size, ts)

        # is the dynamic calibration enabled?
        if dyn_cal_enabled:
            # Increments the number of observations
            self._current_batch_obs = self._current_batch_obs + 1
            logging.debug("obs %s added to the batch as number %s.", ts, self._current_batch_obs)

            # Add the observation
            self._batch[ts] = {}
            self._batch[ts]['file'] = obs_generate_file
            self._batch[ts]['data'] = test

            # Once we have reached the number of batch observations, we can do the dynamic calibration
            if self._current_batch_obs == batch_obs:
                # data for calibration
                x = np.array([])
                x = x.reshape((0, test.size))

                # Build the [NxM] data for the calibration
                for i in self._batch.keys():
                    logging.debug("batch at %s -> %s", i, self._batch[i]['data'].shape)
                    x = np.vstack((x, self._batch[i]['data']))

                # Build the model
                self._sensor.set_data(x)
                self._sensor.do_dynamic_calibration(phase=2, lv=3, lamda=lambda_param)

                # Reset the counter
                self._current_batch_obs = 0

                # Removing all batch observations
                self._batch.clear()

        # Do monitoring
        Qst, Dst = self._sensor.do_monitoring(test)

    except SensorError as ese:
        raise MSNMError(self, ese.get_msg(), method_name)
    except MSNMError as emsnme:
        raise emsnme

    logging.debug("MONITORING --> UCLd: %s | Dst: %s",
                  self._sensor.get_model().get_mspc().getUCLD(),
                  self._sensor.get_mspc().getDst())
    logging.debug("MONITORING --> UCLq: %s | Qst: %s",
                  self._sensor.get_model().get_mspc().getUCLQ(),
                  self._sensor.get_mspc().getQst())

    # Save the generated statistics
    output_generated_file = output_generated_path + "output_" + ts + ".dat"
    header = "UCLq:" + str(self._sensor.get_model().get_mspc().getUCLQ()) + \
             ", UCLd:" + str(self._sensor.get_model().get_mspc().getUCLD())
    list_array = [self._sensor.get_mspc().getQst(), self._sensor.get_mspc().getDst()]
    statistics = np.array(list_array)
    statistics = statistics.reshape((1, statistics.size))
    np.savetxt(output_generated_file, statistics, fmt=valuesFormat,
               delimiter=",", header=header, comments="#")

    # Gets the remote sensor addresses to send the packet to
    remote_addresses = config.get_config()['Sensor']['remote_addresses']

    # Send packets only if there is someone to send them to!
    if remote_addresses:
        # Send the data packet to the corresponding sensor.
        dataPacket = DataPacket()
        # Packet sent counter increments
        self._packet_sent = self._packet_sent + 1
        dataPacket.fill_header({'id': self._packet_sent,
                                'sid': config.get_config()['Sensor']['sid'],
                                'ts': dateutils.get_timestamp(),
                                'type': Packet.TYPE_D})
        dataPacket.fill_body({'Q': self._sensor.get_mspc().getQst(),
                              'D': self._sensor.get_mspc().getDst()})

        logging.debug("Remote sources to send the packet #%s: %s", self._packet_sent, remote_addresses)

        for i in remote_addresses.keys():
            ip = remote_addresses[i]['ip']
            port = remote_addresses[i]['port']
            tcpClient = TCPClient()
            tcpClient.set_server_address((ip, port))
            tcpClient.set_packet_to_send(dataPacket)
            TCPClientThread(tcpClient).start()

    return test, Qst, Dst
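# launch_monitoring() resolves the configured imputation method by name at
# runtime via importlib + getattr. A minimal, self-contained sketch of that
# dispatch pattern (the module path and method map below are hypothetical,
# mirroring the configuration keys used above):
#
# >>> import importlib
# >>> missingDataModule = 'msnm.utils.datautils'          # from configuration
# >>> missingDataMethods = {'zero': 'zeroDataImputation',
# ...                       'average': 'averageDataImputation'}
# >>> selected = 'zero'
# >>> method = getattr(importlib.import_module(missingDataModule),
# ...                  missingDataMethods[selected])
# >>> method.__name__
# 'zeroDataImputation'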