def init_sims(self, replace=False):
    """Precompute L2-normalized document vectors.

    The result is stored in ``self.doctag_syn0norm``. When that attribute
    already holds a value and `replace` is not set, nothing is recomputed.

    If `replace` is set, forget the original vectors and only keep the
    normalized ones = saves lots of memory! In that case ``self.doctag_syn0``
    is normalized in place and ``self.doctag_syn0norm`` becomes an alias of it.

    Note that you **cannot continue training** after doing a replace. The
    model becomes effectively read-only = you can call `most_similar`,
    `similarity` etc., but not `train`.

    Without `replace`, the normalized copy is written either to a memory-mapped
    file at ``self.mapfile_path + '.doctag_syn0norm'`` (when `mapfile_path` is
    truthy) or to a freshly allocated in-memory array.

    Parameters
    ----------
    replace : bool, optional
        Overwrite the raw vectors with their normalized version instead of
        keeping a separate normalized copy.

    """
    already_done = getattr(self, 'doctag_syn0norm', None) is not None
    if already_done and not replace:
        return

    logger.info("precomputing L2-norms of doc weight vectors")

    if replace:
        # Row-by-row in-place division keeps every temporary row-sized.
        # `range` (not `xrange`) so this path also runs on Python 3.
        for i in range(self.doctag_syn0.shape[0]):
            self.doctag_syn0[i, :] /= sqrt((self.doctag_syn0[i, :] ** 2).sum(-1))
        self.doctag_syn0norm = self.doctag_syn0
        return

    if self.mapfile_path:
        self.doctag_syn0norm = np_memmap(
            self.mapfile_path + '.doctag_syn0norm', dtype=REAL,
            mode='w+', shape=self.doctag_syn0.shape)
    else:
        self.doctag_syn0norm = empty(self.doctag_syn0.shape, dtype=REAL)

    # The third positional argument is the ufunc output buffer.
    np_divide(
        self.doctag_syn0,
        sqrt((self.doctag_syn0 ** 2).sum(-1))[..., newaxis],
        self.doctag_syn0norm)
def polaset2np_array(self, i_dataset):
    """Return the first four channels of a dataset, scaled by its integration.

    Each of the channels ``data_l[0]`` .. ``data_l[3]`` is offset by 0.001 and
    divided by the integration values of the dataset.

    The data come from ``self.dw_io.get_data`` when ``self.correction`` equals
    ``False``, otherwise from ``self.dw_io.get_cdata`` (which additionally
    receives ``self.cfileh``).

    i_dataset: index of the dataset table

    Returns a list of four numpy arrays, in the order
    ``[El, Er, stk_q, stk_u]``.

    Note: the derived quantities ``stk_i``, ``stk_v``, ``stk_phi`` and
    ``stk_p`` are intentionally not computed, because they were never part of
    the returned list.
    """
    th = self.datasets[i_dataset].th
    integration = self.dw_io.get_integration(th)

    # Explicit comparison kept on purpose: only a value equal to False
    # selects the uncorrected data (None, for instance, does not).
    if self.correction == False:  # noqa: E712
        data_l = self.dw_io.get_data(th)
    else:
        data_l = self.dw_io.get_cdata(th, self.cfileh)

    # Transposing before and after the division makes `integration`
    # broadcast along the first axis of each channel (assumes 2D channels
    # and a 1D integration vector -- TODO confirm).
    # NOTE(review): the 0.001 offset presumably keeps values non-zero.
    return [
        np_divide((data_l[k] + 0.001).transpose(), integration).transpose()
        for k in range(4)
    ]
def run(self):
    """Normalize the document vectors and query `find_nn` for every document.

    The L2-normalized copy of ``self.doc_model.docvecs.doctag_syn0`` is stored
    as float32 in ``self.syn0norm``. `find_nn` is then called once per
    document index with that matrix. The collected results are neither
    returned nor stored.
    """
    docvecs = self.doc_model.docvecs
    doc_count = len(docvecs)
    raw_vectors = docvecs.doctag_syn0

    # Allocate the output first, then let the ufunc write straight into it.
    self.syn0norm = empty(raw_vectors.shape, dtype='float32')
    row_norms = sqrt((raw_vectors ** 2).sum(-1))
    np_divide(raw_vectors, row_norms[..., None], out=self.syn0norm)

    neighbours = []
    for doc_index in range(doc_count):
        neighbours.append(find_nn(doc_index, self.syn0norm))
def dataset2np_array(self, i_dataset):
    """Return the selected dataset scaled by its integration values (ALPHA).

    The data come from ``self.dw_io.get_data`` when ``self.correction`` equals
    ``False``, otherwise from ``self.dw_io.get_cdata`` (which additionally
    receives ``self.cfileh``).

    i_dataset: index of the dataset table

    Returns a single numpy array when the dataset holds exactly one channel.
    Otherwise returns the list ``[El, Er]`` built from the first two channels,
    each offset by 0.001 before the division.
    """
    handle = self.datasets[i_dataset].th
    integration = self.dw_io.get_integration(handle)

    if self.correction == False:  # noqa: E712 -- explicit comparison kept
        channels = self.dw_io.get_data(handle)
    else:
        channels = self.dw_io.get_cdata(handle, self.cfileh)

    # Single-channel datasets are scaled without the 0.001 offset.
    if len(channels) == 1:
        return np_divide(channels[0].transpose(), integration).transpose()

    return [
        np_divide((channel + 0.001).transpose(), integration).transpose()
        for channel in (channels[0], channels[1])
    ]
def distrib_nn_for_cdf(self, ntss_tmp, bool_print: bool = False):
    """
    Calculates the two indicators, average and standard deviation of the distances, necessary for the use of the CDF of the normal distribution.
    The computation of these indicators are described in `Scoring Message Stream Anomalies in Railway Communication Systems, L.Foulon et al., 2019, ICDMWorkshop <https://ieeexplore.ieee.org/abstract/document/8955558>`_.

    Nothing is returned; the results are stored on the instance:

    * ``self.cdf_mean``: array of shape ``(len(ntss_tmp), number of leaves)``.
      Entry ``[j, leaf]`` is the square root of the mean, over the sequences
      of the leaf, of (squared distance between the leaf barycentre and
      reference ``j``) + (squared distance between the leaf barycentre and
      the sequence).
    * ``self.cdf_std``: array with one value per leaf, the mean of ``node.std``.

    Both are indexed by ``node.id_numpy_leaf``.

    :param numpy.ndarray ntss_tmp: Reference sequences
    :param boolean bool_print: if True, displays the nodes stats on the standard output

    :returns: None
    """
    start_time = time_time()
    node_list, node_list_leaf, node_leaf_ndarray_mean = \
        self.get_list_nodes_and_barycentre()
    if bool_print:
        print("pretrait node --- %s seconds ---" % (time_time() - start_time))
        stdout.flush()
        print(len(node_list), " nodes whose ", len(node_list_leaf), " leafs in tree")
        stdout.flush()

    nb_leaf = len(node_list_leaf)
    cdf_mean = np_zeros((nb_leaf, len(ntss_tmp)))
    cdf_std = np_zeros(nb_leaf)
    nb_ts_by_node = np_zeros(nb_leaf, dtype=np_uint32)

    # Squared distance between every leaf barycentre and every reference.
    centroid_dist = np_square(cdist(node_leaf_ndarray_mean, ntss_tmp))

    for node in node_list_leaf:
        node_id = node.id_numpy_leaf
        cdf_std[node_id] = np_mean(node.std)
        nb_ts_by_node[node_id] = node.get_nb_sequences()

        # Squared distance between the barycentre and each sequence of the
        # same node.
        sq_dist = np_square(cdist([node.mean], node.get_sequences())[0])

        # sum_k (centroid_dist[j] + sq_dist[k]) for every reference j, without
        # materialising the (n_references x n_sequences) intermediate.
        cdf_mean[node_id] = (
            centroid_dist[node_id] * nb_ts_by_node[node_id] + np_sum(sq_dist))

    # Turn the per-leaf sums into means, then into distances.
    cdf_mean = np_sqrt(np_divide(cdf_mean.T, nb_ts_by_node))

    self.cdf_mean = cdf_mean
    self.cdf_std = cdf_std
def r5_dnn_image(target_dirname,
                 chandat_obj=None,
                 chandat_dnn_obj=None,
                 is_saving_chandat_image=True):
    """Turn the DNN channel data of one target into envelope images.

    Processing steps, in order:

    1. sum ``chandat_dnn`` over axis 1 to obtain ``rf_data``;
    2. band-pass filter it along axis 0 (`butter` design applied with
       `filtfilt`);
    3. apply `better_envelope` along axis 0, normalize by the maximum, clip
       with `clip_to_eps` and convert to dB (``20 * log10``);
    4. upsample along axis 1 with `pchip`, clip and convert to dB again.

    Args:
        target_dirname: directory holding the input files; also where the
            result is saved.
        chandat_obj: mapping providing ``'f0'``. Loaded from
            ``CHANDAT_FNAME`` inside `target_dirname` when None.
        chandat_dnn_obj: mapping providing ``'chandat_dnn'``,
            ``'beam_position_x'`` and ``'depth'``. Loaded from
            ``CHANDAT_DNN_FNAME`` inside `target_dirname` when None.
            CAUTION: the ``'chandat_dnn'`` entry is deleted from this mapping,
            including when the caller supplied it.
        is_saving_chandat_image: the result is written with `savemat` only
            when this is exactly ``True`` (identity check).

    Returns:
        dict with the keys ``'rf_data'``, ``'rf_data_filt'``, ``'env'``,
        ``'env_dB'``, ``'envUp'``, ``'envUp_dB'``, ``'beam_position_x_up'``
        and ``'depth'``.
    """
    LOGGER.info('{}: r5: Turning chandat into upsampled envelope...'.format(
        target_dirname))
    if chandat_obj is None:
        chandat_obj = loadmat(os_path_join(target_dirname, CHANDAT_FNAME))
    f0 = chandat_obj['f0']
    if chandat_dnn_obj is None:
        chandat_dnn_obj = loadmat(
            os_path_join(target_dirname, CHANDAT_DNN_FNAME))
    chandat_dnn = chandat_dnn_obj['chandat_dnn']
    beam_position_x = chandat_dnn_obj['beam_position_x']
    depth = chandat_dnn_obj['depth']

    # A 2D `f0` is reduced to its first element.
    if f0.ndim and f0.ndim == 2:
        f0 = f0[0, 0]

    rf_data = chandat_dnn.sum(axis=1)
    # Drop both references to the channel data (local name and dict entry).
    del chandat_dnn, chandat_dnn_obj['chandat_dnn']

    # design a bandpass filter
    n = 4
    # Integer division: the filter order handed to `butter` must be an int.
    order = n // 2
    # NOTE(review): presumably the sampling rate is 4 * f0, so the band edges
    # are normalized by its half -- confirm.
    critical_frequencies = [1e6, 9e6] / (4 * f0 / 2)
    b, a = butter(order, critical_frequencies, btype='bandpass')  # Results are correct

    # chandat_dnn = chandat_dnn.astype(float, copy=False) # REVIEW: necessary?
    rf_data_filt = filtfilt(b, a, rf_data, axis=0,
                            padtype='odd',
                            padlen=3 * (max(len(b), len(a)) - 1))  # Correct
    del a, b

    env = np_apply_along_axis(better_envelope, 0, rf_data_filt)
    # print('r5: env.shape =', env.shape)
    np_divide(env, env.max(), out=env)  # normalize in place
    # NOTE(review): presumably clips `env` in place to a tiny positive floor
    # so that log10 stays finite -- confirm against clip_to_eps.
    clip_to_eps(env)
    # np_clip(env, np_spacing(1), None, out=env)
    env_dB = np_zeros_like(env)
    np_log10(env, out=env_dB)
    np_multiply(env_dB, 20, out=env_dB)

    # Upscale lateral sampling
    up_scale = get_dict_from_file_json(
        os_path_join(
            target_dirname,
            TARGET_PARAMETERS_FNAME))[TARGET_PARAMETERS_KEY_SCALE_UPSAMPLE]
    up_scale_inverse = 1 / up_scale

    num_beams = env.shape[1]
    x = np_arange(1, num_beams + 1)
    new_x = np_arange(1, num_beams + up_scale_inverse, up_scale_inverse)

    # TODO: optimization: instead of doing this apply thing, can we pass in the
    # whole `env` and specify axis?
    def curried_pchip(y):
        return pchip(x, y)(new_x)

    env_up = np_apply_along_axis(curried_pchip, 1, env)
    # print('r5: env_up.shape =', env_up.shape)
    del curried_pchip, new_x, x

    clip_to_eps(env_up)
    # np_clip(env_up, np_spacing(1), None, out=env_up)
    env_up_dB = np_zeros_like(env_up)
    np_log10(env_up, out=env_up_dB)
    np_multiply(env_up_dB, 20, out=env_up_dB)

    # One lateral position per upsampled column, spanning the original range.
    beam_position_x_up = np_linspace(beam_position_x.min(), beam_position_x.max(), env_up_dB.shape[1])  # pylint: disable=E1101, E1136
    del beam_position_x

    chandat_image_obj = {
        'rf_data': rf_data,
        'rf_data_filt': rf_data_filt,
        'env': env,
        'env_dB': env_dB,
        'envUp': env_up,
        'envUp_dB': env_up_dB,
        'beam_position_x_up': beam_position_x_up,
        'depth': depth,
    }

    if is_saving_chandat_image is True:
        chandat_image_path = os_path_join(target_dirname,
                                          CHANDAT_IMAGE_SAVE_FNAME)
        savemat(chandat_image_path, chandat_image_obj)

    LOGGER.info('{}: r5 Done'.format(target_dirname))
    return chandat_image_obj