def entropy_cleaning(self, matrix, targ_limit=150):
    """
    Entropy-cleaning of lightcurve matrix using the SVD U-matrix.

    Parameters:
        matrix (:class:`numpy.ndarray`):
        targ_limit (int, optional): Maximum number of targets to remove during cleaning.

    .. codeauthor:: Mikkel N. Lund <*****@*****.**>
    """
    logger = logging.getLogger(__name__)

    # Calculate the principal components:
    pca = PCA(self.ncomponents, random_state=self.random_state)
    U, _, _ = pca._fit(matrix)

    ent = compute_entropy(U)
    logger.info('Entropy start: %s', ent)

    targets_removed = 0
    components = np.arange(self.ncomponents)

    with np.errstate(invalid='ignore'):
        while np.any(ent < self.threshold_entropy):
            com = components[ent < self.threshold_entropy][0]

            # Remove highest relative weight target
            m = nanmedian(U[:, com])
            s = mad_to_sigma * nanmedian(np.abs(U[:, com] - m))
            dev = np.abs(U[:, com] - m) / s

            idx0 = np.argmax(dev)

            # Remove the star from the lightcurve matrix:
            star_no = np.ones(U.shape[0], dtype=bool)
            star_no[idx0] = False
            matrix = matrix[star_no, :]

            targets_removed += 1
            if targets_removed >= targ_limit:
                break

            U, _, _ = pca._fit(matrix)
            ent = compute_entropy(U)

    logger.info('Entropy end: %s', ent)
    logger.info('Targets removed: %d', targets_removed)
    return matrix
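# The compute_entropy() helper used above is not shown in this snippet. A minimal
# sketch of one possible implementation, assuming it returns the differential
# entropy of each column of the SVD U-matrix estimated from a Gaussian kernel
# density (the name compute_entropy_sketch and all details here are assumptions,
# not the original helper):
import numpy as np
from scipy.stats import gaussian_kde

def compute_entropy_sketch(U):
    entropies = np.empty(U.shape[1])
    for k in range(U.shape[1]):
        kde = gaussian_kde(U[:, k])
        x = np.linspace(U[:, k].min(), U[:, k].max(), 500)
        p = kde(x)
        p = p / np.trapz(p, x)  # normalise the density estimate
        # Differential entropy: -integral p(x) * log(p(x)) dx
        entropies[k] = -np.trapz(p * np.log(p + 1e-300), x)
    return entropies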
def PCA_on_training_model():
    file_list = interface.get_available_sha256()
    ex_list = np.array([
        pefeatures.PEFeatureExtractor().extract(interface.fetch_file(b))
        for b in file_list
    ])
    print("all_samples: ", ex_list.shape)

    # nor_list = normalize(ex_list, axis=0)
    # nor_list = MinMaxScaler().fit_transform(ex_list)
    nor_list, data_min, data_max, scale_, min_ = MinMaxImp(ex_list)

    pca = PCA(n_components=0.99).fit(nor_list)
    U, S, V = pca._fit(nor_list)

    # dic_elements = {"n_component": pca.n_components_, "scale_": scale_, "min_": min_}
    dic_elements = {"n_component": pca.n_components_}
    np.save("pca_models/features.npy", ex_list)
    np.save("pca_models/nor_features.npy", nor_list)
    np.save("pca_models/U.npy", U)
    np.save("pca_models/S.npy", S)
    np.save("pca_models/V.npy", V)
    np.save("pca_models/scale.npy", scale_)
    np.save("pca_models/min.npy", min_)
    createDictCSV("pca_models/dic_elements.csv", dic_elements)
    print("reduced dimension: ", pca.n_components_)

    return ex_list, nor_list, U, S, V
def pca(X, rank=None):
    """
    Computes the PCA of X where the observations are on the rows.

    Suppose X is (n x d) (n observations) and r = min(n, d, rank), then
        U (n x r): the scores
        D (r x r): the singular values
        V (d x r): the loadings

    Parameters
    ----------
    X (numpy matrix/array): the data matrix

    rank (None, int): the number of PCs to compute. If None, will
        compute the full PCA.

    Output
    ------
    U, D, V
    """
    # m = np.asarray(X.mean(axis=0)).reshape(-1)
    # m = X.mean(axis=0)
    # X_cent = X - np.outer(np.ones((X.shape[0],)), m)

    pca = PCA(n_components=rank, random_state=42, svd_solver='randomized')
    return pca._fit(X)
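# Example usage of the pca() wrapper above (a sketch only; it relies on the
# private sklearn method PCA._fit returning the (U, S, V) triple, which holds
# for the sklearn versions these snippets target but is not a stable public API):
import numpy as np

X_demo = np.random.RandomState(0).randn(100, 20)  # hypothetical data matrix
U, D, V = pca(X_demo, rank=5)
# The equivalent scores via the public API would be:
# scores = PCA(n_components=5).fit_transform(X_demo)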
def learn_PCA_matrix_for_spocs_with_sklearn(spocs, desired_dimension):
    print('spocs in learn PCA ', spocs.shape)
    pca = PCA(n_components=desired_dimension)
    U, S, V = pca._fit(torch.t(spocs).cpu().numpy())
    print('U ', U.shape)
    print('S ', S.shape)
    print('V ', V.shape)
    print('pca.components_.shape', pca.components_.shape)
    return U[:, :desired_dimension], S[:desired_dimension]
def clean_cbv(Matrix, n_components, ent_limit=-1.5, targ_limit=50):
    logger = logging.getLogger(__name__)

    # Calculate the principal components:
    logger.info("Doing Principal Component Analysis...")
    pca = PCA(n_components)
    U, _, _ = pca._fit(Matrix)

    Ent = compute_entropy(U)
    logger.info('Entropy start: ' + str(Ent))

    targets_removed = 0
    components = np.arange(n_components)

    with np.errstate(invalid='ignore'):
        while np.any(Ent < ent_limit):
            com = components[(Ent < ent_limit)][0]

            # Remove highest relative weight target
            m = nanmedian(U[:, com])
            s = 1.46 * nanmedian(np.abs(U[:, com] - m))
            dev = np.abs(U[:, com] - m) / s

            idx0 = np.argmax(dev)
            star_no = np.ones(U.shape[0], dtype=bool)
            star_no[idx0] = False

            Matrix = Matrix[star_no, :]
            U, _, _ = pca._fit(Matrix)

            targets_removed += 1
            if targets_removed > targ_limit:
                break

            Ent = compute_entropy(U)

    logger.info('Entropy end: ' + str(Ent))
    logger.info('Targets removed: ' + str(int(targets_removed)))
    return Matrix
def compute_cbvs(self, targ_limit=150):
    """
    Main function for computing CBVs.

    The steps taken in the function are:

    #. Run :meth:`lightcurve_matrix` to obtain matrix with gap-filled,
       nan-removed light curves for the most correlated stars in a given cbv-area.
    #. Compute principal components.
    #. Run :meth:`entropy_cleaning` to remove significant single-star
       contributors based on entropy.
    #. Rerun SNR test on CBVs, and only retain CBVs that pass the test.
    #. Recalculate principal components using cleaned star list.
    #. Save CBVs and make diagnostics plots.

    Parameters:
        targ_limit (int, optional): Maximum number of targets to remove during entropy-cleaning.

    .. codeauthor:: Mikkel N. Lund <*****@*****.**>
    .. codeauthor:: Rasmus Handberg <*****@*****.**>
    """
    logger = logging.getLogger(__name__)
    logger.info('running CBV')
    logger.info('------------------------------------')

    if 'cbv-ini' in self.hdf:
        logger.info('CBV for SECTOR=%d, CADENCE=%d, AREA=%d already calculated.',
            self.sector, self.cadence, self.cbv_area)
        return
    logger.info('Computing CBV for SECTOR=%d, CADENCE=%d, AREA=%d...',
        self.sector, self.cadence, self.cbv_area)

    # Extract or compute cleaned and gapfilled light curve matrix
    mat, indx_nancol, Ntimes = self.lightcurve_matrix()

    # Calculate initial CBVs
    logger.info('Computing %d CBVs', self.ncomponents)
    pca = PCA(self.ncomponents, random_state=self.random_state)
    U0, _, _ = pca._fit(mat)

    cbv0 = np.full((Ntimes, self.ncomponents), np.nan, dtype='float64')
    cbv0[~indx_nancol, :] = np.transpose(pca.components_)

    # Clean away targets that contribute significantly
    # as a single star to a given CBV (based on entropy)
    logger.info('Doing Entropy Cleaning...')
    mat = self.entropy_cleaning(mat, targ_limit=targ_limit)

    # Calculate the principal components of cleaned matrix
    logger.info("Doing Principal Component Analysis...")
    U, _, _ = pca._fit(mat)

    cbv = np.full((Ntimes, self.ncomponents), np.nan, dtype='float64')
    cbv[~indx_nancol, :] = np.transpose(pca.components_)

    # Signal-to-Noise test (here only for plotting)
    #indx_lowsnr = cbv_snr_test(cbv, self.threshold_snrtest)

    # Save the CBV to file:
    self.hdf.create_dataset('cbv-ini', data=cbv)

    #------------------------ PLOTS ---------------------------
    # Plot the "effectiveness" of each CBV:
    max_components = 20
    n_cbv_components = np.arange(max_components, dtype=int)
    pca_scores = compute_scores(mat, n_cbv_components)

    fig0 = plt.figure(figsize=(12, 8))
    ax0 = fig0.add_subplot(121)
    ax0.plot(n_cbv_components, pca_scores, 'b', label='PCA scores')
    ax0.set_xlabel('nb of components')
    ax0.set_ylabel('CV scores')
    ax0.legend(loc='lower right')

    ax02 = fig0.add_subplot(122)
    ax02.plot(np.arange(1, cbv0.shape[1] + 1), pca.explained_variance_ratio_, '.-')
    ax02.axvline(x=cbv.shape[1] + 0.5, ls='--', color='k')
    ax02.set_xlabel('CBV number')
    ax02.set_ylabel('Variance explained ratio')

    fig0.savefig(os.path.join(self.cbv_plot_folder,
        f'cbv-perf-s{self.sector:04d}-c{self.cadence:04d}-a{self.cbv_area:d}.png'))
    plt.close(fig0)

    # Plot all the CBVs:
    fig, axes = plt.subplots(int(np.ceil(self.ncomponents / 2)), 2, figsize=(12, 16))
    fig2, axes2 = plt.subplots(int(np.ceil(self.ncomponents / 2)), 2, figsize=(12, 16))
    fig.subplots_adjust(wspace=0.23, hspace=0.46, left=0.08, right=0.96, top=0.94, bottom=0.055)
    fig2.subplots_adjust(wspace=0.23, hspace=0.46, left=0.08, right=0.96, top=0.94, bottom=0.055)

    for k, ax in enumerate(axes.flatten()):
        if k < cbv0.shape[1]:
            #if indx_lowsnr is not None and indx_lowsnr[k]:
            #    col = 'c'
            #else:
            #    col = 'k'
            ax.plot(cbv0[:, k] + 0.1, 'r-')
            ax.plot(cbv[:, k], ls='-', color='k')
            ax.set_title(f'Basis Vector {k+1:d}')

    for k, ax in enumerate(axes2.flatten()):
        if k < U0.shape[1]:
            ax.plot(-np.abs(U0[:, k]), 'r-')
            ax.plot(np.abs(U[:, k]), 'k-')
            ax.set_title(f'Basis Vector {k+1:d}')

    fig.savefig(os.path.join(self.cbv_plot_folder,
        f'cbvs_ini-s{self.sector:04d}-c{self.cadence:04d}-a{self.cbv_area:d}.png'))
    fig2.savefig(os.path.join(self.cbv_plot_folder,
        f'U_cbvs-s{self.sector:04d}-c{self.cadence:04d}-a{self.cbv_area:d}.png'))
    plt.close(fig)
    plt.close(fig2)
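# The compute_scores() helper used for the "effectiveness" plot above is not
# shown. A minimal sketch, assuming it follows the standard scikit-learn recipe
# of cross-validated PCA log-likelihood as a function of the number of
# components (the name compute_scores_sketch and the cv details are assumptions):
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

def compute_scores_sketch(X, n_components_range):
    pca = PCA()
    scores = []
    for n in n_components_range:
        pca.n_components = n
        # Mean cross-validated log-likelihood of the fitted probabilistic PCA model
        scores.append(np.mean(cross_val_score(pca, X)))
    return scores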
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
# %matplotlib inline

df = pd.read_csv("epilepsy.csv")
df['y'].value_counts()
del df['Unnamed: 0']

y = df['y']
x = df.drop(columns=["y"])

model_lr = LogisticRegression()
model_lr.fit(x, y)
model_lr.score(np.array(x), np.array(y))

from sklearn.decomposition import PCA
pcs = PCA(n_components=33)  # Keep changing n_components until the cumulative explained variance ratio (evr) gets close to 0.90
pcs._fit(x)
pcs.explained_variance_
evr = pcs.explained_variance_ratio_
np.sum(evr)

pincomp = pcs.components_
pincomp.shape

scoring_matrix = pcs.transform(x)
scoring_matrix

model_lr_1 = LogisticRegression()
model_lr_1.fit(scoring_matrix, y)
model_lr_1.score(scoring_matrix, y)
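# A rough public-API equivalent of the PCA + logistic regression steps above,
# using a pipeline with fit/transform instead of the private _fit method
# (a sketch added for illustration, not part of the original notebook):
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(PCA(n_components=33), LogisticRegression())
pipe.fit(x, y)
print(pipe.score(x, y))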
def learn_pca_matrix_for_spocs_with_sklearn(spocs, desired_dimension):
    pca = PCA(n_components=desired_dimension)
    u, s, v = pca._fit(torch.t(spocs).cpu().numpy())
    return u[:, :desired_dimension], s[:desired_dimension]
def do_pca(X):
    pca = PCA()
    U, S, V = pca._fit(X)
    X_transformed = np.dot(X - pca.mean_, pca.components_.T)
    return pca, X_transformed
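# The manual projection in do_pca() mirrors what PCA.transform() does: subtract
# the fitted mean, then project onto the components. A small sanity check
# (sketch; X_check is made-up demo data):
import numpy as np

X_check = np.random.RandomState(1).randn(50, 8)
pca_obj, X_proj = do_pca(X_check)
assert np.allclose(X_proj, pca_obj.transform(X_check))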
def compute_cbvs(self, cbv_area, ent_limit=-1.5, targ_limit=150):
    """
    Main function for computing CBVs.

    The steps taken in the function are:

    1: run :py:func:`CBVCorrector.lc_matrix_clean` to obtain matrix with
       gap-filled, nan-removed light curves for the most correlated stars
       in a given cbv-area

    2: compute principal components and remove significant single-star
       contributors based on entropy

    3: rerun SNR test on CBVs, and only retain CBVs that pass the test

    4: save CBVs and make diagnostics plots

    Parameters:
        *self*: all parameters defined in class init

    Returns:
        Saves CBVs per cbv-area in ".npy" files

    .. codeauthor:: Mikkel N. Lund <*****@*****.**>
    """
    logger = logging.getLogger(__name__)

    logger.info('running CBV')
    logger.info('------------------------------------')

    if os.path.exists(os.path.join(self.data_folder,
            'cbv_ini-%s-%d.npy' % (self.datasource, cbv_area))):
        logger.info('CBV for area %d already calculated' % cbv_area)
        return
    else:
        logger.info('Computing CBV for %s area %d' % (self.datasource, cbv_area))

        # Extract or compute cleaned and gapfilled light curve matrix
        mat0, _, indx_nancol, Ntimes = self.lc_matrix_clean(cbv_area)

        # Calculate initial CBVs
        logger.info('Computing %d CBVs' % self.ncomponents)
        pca0 = PCA(self.ncomponents)
        U0, _, _ = pca0._fit(mat0)

        cbv0 = np.empty((Ntimes, self.ncomponents), dtype='float64')
        cbv0.fill(np.nan)
        cbv0[~indx_nancol, :] = np.transpose(pca0.components_)

        logger.info('Cleaning matrix for CBV - remove single dominant contributions')

        # Clean away targets that contribute significantly as a single star to a given CBV (based on entropy)
        mat = clean_cbv(mat0, self.ncomponents, ent_limit, targ_limit)

        # Calculate the principal components of cleaned matrix
        logger.info("Doing Principal Component Analysis...")
        pca = PCA(self.ncomponents)
        U, _, _ = pca._fit(mat)

        cbv = np.empty((Ntimes, self.ncomponents), dtype='float64')
        cbv.fill(np.nan)
        cbv[~indx_nancol, :] = np.transpose(pca.components_)

        # # Signal-to-Noise test (here only for plotting)
        # indx_lowsnr = cbv_snr_test(cbv, self.threshold_snrtest)

        # Save the CBV to file:
        np.save(os.path.join(self.data_folder,
            'cbv_ini-%s-%d.npy' % (self.datasource, cbv_area)), cbv)

        ####################### PLOTS #################################
        # Plot the "effectiveness" of each CBV:
        max_components = 20
        n_cbv_components = np.arange(max_components, dtype=int)
        pca_scores = compute_scores(mat, n_cbv_components)

        fig0 = plt.figure(figsize=(12, 8))
        ax0 = fig0.add_subplot(121)
        ax0.plot(n_cbv_components, pca_scores, 'b', label='PCA scores')
        ax0.set_xlabel('nb of components')
        ax0.set_ylabel('CV scores')
        ax0.legend(loc='lower right')

        ax02 = fig0.add_subplot(122)
        ax02.plot(np.arange(1, cbv0.shape[1] + 1), pca.explained_variance_ratio_, '.-')
        ax02.axvline(x=cbv.shape[1] + 0.5, ls='--', color='k')
        ax02.set_xlabel('CBV number')
        ax02.set_ylabel('Variance explained ratio')
        fig0.savefig(os.path.join(self.data_folder,
            'cbv-perf-%s-area%d.png' % (self.datasource, cbv_area)))
        plt.close(fig0)

        # Plot all the CBVs:
        fig, axes = plt.subplots(int(np.ceil(self.ncomponents / 2)), 2, figsize=(12, 16))
        fig2, axes2 = plt.subplots(int(np.ceil(self.ncomponents / 2)), 2, figsize=(12, 16))
        fig.subplots_adjust(wspace=0.23, hspace=0.46, left=0.08, right=0.96, top=0.94, bottom=0.055)
        fig2.subplots_adjust(wspace=0.23, hspace=0.46, left=0.08, right=0.96, top=0.94, bottom=0.055)

        for k, ax in enumerate(axes.flatten()):
            try:
                ax.plot(cbv0[:, k] + 0.1, 'r-')
                # if not indx_lowsnr is None:
                #     if indx_lowsnr[k]:
                #         col = 'c'
                #     else:
                #         col = 'k'
                # else:
                #     col = 'k'
                ax.plot(cbv[:, k], ls='-', color='k')
                ax.set_title('Basis Vector %d' % (k + 1))
            except:
                pass

        for k, ax in enumerate(axes2.flatten()):
            try:
                ax.plot(-np.abs(U0[:, k]), 'r-')
                ax.plot(np.abs(U[:, k]), 'k-')
                ax.set_title('Basis Vector %d' % (k + 1))
            except:
                pass

        fig.savefig(os.path.join(self.data_folder,
            'cbvs_ini-%s-area%d.png' % (self.datasource, cbv_area)))
        fig2.savefig(os.path.join(self.data_folder,
            'U_cbvs-%s-area%d.png' % (self.datasource, cbv_area)))
        plt.close(fig)
        plt.close(fig2)
def do_pca(X):
    pca = PCA()
    U, S, V = pca._fit(X)
    X_transformed = np.dot(X - pca.mean_, pca.components_.T)
    return pca, X_transformed
def DPCA_cal(x, h):
    # x_normed = (x - x.min(0)) / x.ptp(0)  # peak to peak normalization
    x_normed = preprocessing.scale(x)  # Standardize a dataset along any axis
    # Center to the mean and component wise scale to unit variance
    x = x_normed
    x = Utility.Augmentation(x, 1, h)

    pca = PCA()
    pca.fit(x)
    U, S, V = pca._fit(x)

    p_component_threshold = 0.9
    ratio_sum = 0
    for i in range(len(pca.explained_variance_ratio_)):
        if ratio_sum > p_component_threshold:
            break
        else:
            ratio_sum = ratio_sum + pca.explained_variance_ratio_[i]

    # p_component_num = sum(pca.explained_variance_ratio_ > p_component_threshold)
    p_component_num = i

    p_hat = pca.components_[:, 0:p_component_num]
    pi_hat = np.matmul(p_hat, np.transpose(p_hat))
    x_hat = np.matmul(x, pi_hat)

    p_til = pca.components_[:, p_component_num:]
    pi_til = np.matmul(p_til, np.transpose(p_til))
    x_til = np.matmul(x, pi_til)

    x_reconstructed = x_hat + x_til

    x_pc = np.matmul(x, p_hat)
    x_res = np.matmul(x, p_til)

    x_hat_reconst = Utility.AugmentReverse(x_hat, 1, h)
    x_til_reconst = Utility.AugmentReverse(x_til, 1, h)
    x_augment_reconst = x_hat_reconst + x_til_reconst

    plt.figure(1)
    plt.subplot(2, 2, 1)
    plt.plot(x)
    plt.title('original x')
    plt.subplot(2, 2, 2)
    plt.plot(x_hat)
    plt.title('x_hat')
    plt.subplot(2, 2, 3)
    plt.plot(x_til)
    plt.title('x_til')
    plt.subplot(2, 2, 4)
    plt.plot(x_reconstructed)
    plt.title('x_reconstructed')

    plt.figure(2)
    plt.subplot(2, 2, 1)
    plt.plot(x)
    plt.title('original x')
    plt.subplot(2, 2, 2)
    plt.plot(x_hat_reconst)
    plt.title('x_hat_reconst')
    plt.subplot(2, 2, 3)
    plt.plot(x_til_reconst)
    plt.title('x_til_reconst')
    plt.subplot(2, 2, 4)
    plt.plot(x_augment_reconst)
    plt.title('x_augment_reconst')

    plt.figure(3)
    plt.subplot(2, 1, 1)
    plt.plot(x_pc)
    plt.title('data in principal subspaces')
    plt.subplot(2, 1, 2)
    plt.plot(x_res)
    plt.title('data in residual subspaces')
    plt.show()

    return x_hat_reconst
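# Utility.Augmentation / Utility.AugmentReverse are external helpers not shown
# here. In dynamic PCA the data matrix is typically augmented with h time-lagged
# copies of each sample before the decomposition; a minimal sketch of that idea
# follows (the name augmentation_sketch and the exact lag convention are
# assumptions, not the original Utility implementation):
import numpy as np

def augmentation_sketch(x, step, h):
    # Stack x[t], x[t-step], ..., x[t-step*h] side by side for each usable t.
    n = x.shape[0]
    lags = [x[h * step - i * step: n - i * step] for i in range(h + 1)]
    return np.hstack(lags)  # shape: (n - h*step, d*(h+1))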
def main():
    logger.debug('App started')

    parser = argparse.ArgumentParser(description='Key processing tool')
    parser.add_argument('-t', '--threads', dest='threads', type=int, default=None,
                        help='Number of threads to use for cert download')
    parser.add_argument('--debug', dest='debug', action='store_const', const=True,
                        help='enables debug mode')
    parser.add_argument('--verbose', dest='verbose', action='store_const', const=True,
                        help='enables verbose mode')

    parser.add_argument('--dump-json', dest='dump_json', action='store_const', const=True,
                        help='dumps JSON of the filtered certificates')
    parser.add_argument('--dump-cert', dest='dump_cert', action='store_const', const=True,
                        help='dumps PEM of the filtered certificates')

    parser.add_argument('-f', '--filter-org', dest='filter_org',
                        help='Filter out certificates issued with given organization - regex')
    parser.add_argument('--filter-domain', dest='filter_domain',
                        help='Filter out certificates issued for the given domain - regex')

    parser.add_argument('--pubs', dest='pubs', nargs=argparse.ZERO_OR_MORE,
                        help='File with public keys (PEM)')
    parser.add_argument('--certs', dest='certs', nargs=argparse.ZERO_OR_MORE,
                        help='File with certificates (PEM)')

    parser.add_argument('--ossl', dest='ossl', type=int, default=None,
                        help='OpenSSL generator')

    parser.add_argument('--per-key-stat', dest='per_key_stat', action='store_const', const=True,
                        help='Print prob matching for each key')

    parser.add_argument('--subs', dest='subs', action='store_const', const=True,
                        help='Plot random subgroups charts')
    parser.add_argument('--subs-k', dest='subs_k', type=int, default=5,
                        help='Size of the subset')
    parser.add_argument('--subs-n', dest='subs_n', type=int, default=1000,
                        help='Number of subsets to sample')

    parser.add_argument('--pca-src', dest='pca_src', action='store_const', const=True,
                        help='Plot PCA sampled distribution vs collected one')
    parser.add_argument('--pca-src-n', dest='pca_src_n', type=int, default=10000,
                        help='Number of subsets to sample from source distributions')
    parser.add_argument('--pca-src-k', dest='pca_src_k', type=int, default=3,
                        help='Size of the subset from the source distribution')

    parser.add_argument('--pca-grp', dest='pca_grp', action='store_const', const=True,
                        help='Plot PCA on the input keys (groups)')

    parser.add_argument('--mixture', dest='mixture', action='store_const', const=True,
                        help='Mixture distribution on masks - sources')

    parser.add_argument('--distrib', dest='distrib', action='store_const', const=True,
                        help='Plot distributions - to the PDF')
    parser.add_argument('--distrib-mix', dest='distribmix', action='store_const', const=True,
                        help='Plot distributions groups mixed with sources')

    parser.add_argument('--key-dist', dest='plot_key_dist', action='store_const', const=True,
                        help='Plots key mask distribution')

    parser.add_argument('files', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='file with ssl-dump json output')

    args = parser.parse_args()

    last_src_id = 0
    src_names = []
    masks_db = []
    masks_src = []
    cert_db = []
    keys_db = []

    # Input = ssl-dump output
    if len(args.files) > 0:
        # Cert Organization Filtering
        re_org = None if args.filter_org is None else re.compile(args.filter_org, re.IGNORECASE)
        # Domain filtering
        re_dom = None if args.filter_domain is None else re.compile(args.filter_domain, re.IGNORECASE)

        # Process files
        for fl in args.files:
            with open(fl, mode='r') as fh:
                data = fh.read()

                # Parse json out
                if '-----BEGIN JSON-----' in data:
                    if '-----END JSON-----' not in data:
                        raise ValueError('BEGIN JSON present but END JSON not')
                    match = re.search(r'-----BEGIN JSON-----(.+?)-----END JSON-----', data,
                                      re.MULTILINE | re.DOTALL)
                    if match is None:
                        raise ValueError('Could not extract JSON')
                    data = match.group(1)

                json_data = json.loads(data)
                for cert in json_data:
                    org = cert['org']
                    if org is None:
                        org = ''
                    if re_org is not None and re_org.match(org) is None:
                        if args.verbose:
                            print('Organization filtered out %s' % org)
                        continue
                    if re_dom is not None:
                        dom_match = re_dom.match(cert['cn']) is not None
                        for alt in cert['alts']:
                            dom_match |= re_dom.match(alt) is not None
                        if not dom_match:
                            if args.verbose:
                                print('Domain filtered out %s' % cert['cn'])
                            continue

                    cert_db.append(cert)
                    masks_db.append(cert['pubkey']['mask'])
                    masks_src.append(last_src_id)

            src_names.append(fl)
            last_src_id += 1

        if args.verbose:
            print('Certificate database size %d' % len(cert_db))

        if args.dump_json:
            print(json.dumps(cert_db))

        if args.dump_cert:
            for cert in cert_db:
                print cert['cert']

    # public key list processing
    if args.pubs is not None:
        for pubf in args.pubs:
            with open(pubf, mode='r') as fh:
                data = fh.read()
                keys = []
                for match in re.finditer(r'-----BEGIN PUBLIC KEY-----(.+?)-----END PUBLIC KEY-----',
                                         data, re.MULTILINE | re.DOTALL):
                    key = match.group(0)
                    keys.append(key)
                print('File %s keys num: %d' % (pubf, len(keys)))

                # pubkey -> mask
                for key in keys:
                    pub = serialization.load_pem_public_key(key, utils.get_backend())
                    mask = keys_basic.compute_key_mask(pub.public_numbers().n)
                    keys_db.append(pub)
                    masks_db.append(mask)
                    masks_src.append(last_src_id)

            src_names.append(pubf)
            last_src_id += 1

    # extract public key from certificate
    if args.certs is not None:
        for certf in args.certs:
            with open(certf, mode='r') as fh:
                data = fh.read()
                certs = []
                for match in re.finditer(r'-----BEGIN CERTIFICATE-----(.+?)-----END CERTIFICATE-----',
                                         data, re.MULTILINE | re.DOTALL):
                    cert = match.group(0)
                    certs.append(cert)

                # cert -> mask
                for cert in certs:
                    x509 = utils.load_x509(str(cert))
                    pub = x509.public_key()
                    mask = keys_basic.compute_key_mask(pub.public_numbers().n)
                    keys_db.append(pub)
                    masks_db.append(mask)
                    masks_src.append(last_src_id)

            src_names.append(certf)
            last_src_id += 1

    # generate openssl keys on the fly
    if args.ossl is not None:
        for i in range(0, args.ossl):
            print('Generating RSA1024 key %03d' % i)

            key = OpenSSL.crypto.PKey()
            key.generate_key(OpenSSL.crypto.TYPE_RSA, 1024)
            key_pem = OpenSSL.crypto.dump_privatekey(OpenSSL.crypto.FILETYPE_PEM, key)

            priv = serialization.load_pem_private_key(key_pem, None, utils.get_backend())
            mask = keys_basic.compute_key_mask(priv.public_key().public_numbers().n)
            keys_db.append(priv.public_key())
            masks_db.append(mask)
            masks_src.append(last_src_id)

        src_names.append('ossl-%d' % args.ossl)
        last_src_id += 1

    # Load statistics
    st = key_stats.KeyStats()
    st.load_tables()
    if args.verbose:
        print('Source stats: ')
        for src in st.sources_cn:
            print(' %30s: %08d' % (src, st.sources_cn[src]))
        print('Group stats:')
        for grp in st.groups:
            print(' %30s: %02d' % (grp, st.get_group_size(grp)))

    # mask indices
    mask_map, mask_max, mask_map_x, mask_map_y, mask_map_last_x, mask_map_last_y = \
        keys_basic.generate_pubkey_mask_indices()
    print('Max mask 1D config: [%d]' % mask_max)
    print('Max mask 2D config: [%d, %d]' % (mask_map_last_x, mask_map_last_y))

    # masks processing part
    if len(masks_db) == 0:
        return

    # Simple match
    if args.per_key_stat:
        print('Per-key matching: ')
        for idx, mask in enumerate(masks_db):
            print('Key %02d, mask: %s' % (idx, mask))

            res = []
            for src in st.table_prob:
                val = st.table_prob[src][mask]
                res.append((src, val if val is not None else 0))
            print_res(res, st)

    # Total key matching
    use_loglikelihood = True
    print('Fit for all keys in one distribution:')
    total_weights = src_total_match = comp_total_match_dict(masks_db, st,
                                                            loglikelihood=use_loglikelihood)
    res = key_val_to_list(src_total_match)
    print_res(res, st, loglikelihood=use_loglikelihood)
    res = st.res_src_to_group(res)
    # bar_chart(res=res, title='Fit for all keys')

    # Avg + mean
    print('Avg + mean:')
    src_total_match = {}  # source -> [p1, p2, p3, p4, ..., p_keynum]
    for src in st.table_prob:
        src_total_match[src] = []
        for idx, mask in enumerate(masks_db):
            val = keys_basic.aggregate_mask(st.sources_masks_prob[src], mask)
            if use_loglikelihood:
                if total_weights[src] is not None:
                    src_total_match[src].append(val + total_weights[src])
                else:
                    src_total_match[src].append(-9999.9)
            else:
                src_total_match[src].append(val * total_weights[src])
            pass
        pass

    res = []
    devs = []
    for src in st.sources:
        m = np.mean(src_total_match[src])
        s = np.std(src_total_match[src])
        res.append((src, m))
        devs.append(s)

    # Total output
    print_res(res, st, error=devs, loglikelihood=use_loglikelihood)
    # bar_chart(res=res, error=devs, title='Avg for all keys + error')

    # PCA on the keys - groups
    keys_grp_vec = []
    for idx, mask in enumerate(masks_db):
        keys_grp_vec.append([])
        for src in st.groups:
            keys_grp_vec[idx].append(0)
        for idxs, src in enumerate(st.sources):
            grp = st.src_to_group(src)
            prob = st.table_prob[src][mask]
            keys_grp_vec[idx][st.get_group_idx(grp)] += prob

    if args.pca_grp:
        X = np.array(keys_grp_vec)
        pca = PCA(n_components=2)
        pca.fit(X)
        X_transformed = pca.transform(X)
        print('PCA mean: %s, components: ' % pca.mean_)
        print(pca.components_)

        masks_src_np = np.array(masks_src)
        plt.rcdefaults()
        colors = matplotlib.cm.rainbow(np.linspace(0, 1, last_src_id))
        for src_id in range(0, last_src_id):
            plt.scatter(X_transformed[masks_src_np == src_id, 0],
                        X_transformed[masks_src_np == src_id, 1],
                        label=src_names[src_id],
                        color=colors[src_id],
                        alpha=0.25,
                        marker=',')
        plt.legend(loc="best", shadow=False, scatterpoints=1)
        plt.show()

    # Random subset
    if args.subs:
        masks_db_tup = []
        for idx, mask in enumerate(masks_db):
            masks_db_tup.append((idx, mask, masks_src[idx]))

        # Many random subsets, top groups
        subs_size = args.subs_k
        subs_count = args.subs_n
        groups_cnt = {}
        subs_data = []
        subs_data_mark = []
        dsrc_num = last_src_id + 1

        # Take subs_count samples from the input masks_db, evaluate it, prepare for PCA
        for i in range(0, subs_count):
            masks = random_subset(masks_db_tup, subs_size)
            src_total_match = comp_total_match_dict([x[1] for x in masks], st)
            res = key_val_to_list(src_total_match)

            total = 0.0
            for tup in res:
                total += tup[1]

            # data vectors for PCA
            tmp_data = []
            for idx, tmp_src in enumerate(st.sources):
                val = src_total_match[tmp_src]
                val = long(math.floor(val * (1000.0 / total)))
                tmp_data.append(val)

            # PCA on groups.
            # if want PCA on sources, use subs_data.append(tmp_data)
            subs_data.append(tmp_data)

            # res_grp_val = st.res_src_to_group(zip(st.sources, tmp_data))
            # subs_data.append([x[1] for x in res_grp_val])

            subs_dsources = {}
            max_dsrc = (0, 0)
            for dsrc in [x[2] for x in masks]:
                if dsrc not in subs_dsources:
                    subs_dsources[dsrc] = 0
                subs_dsources[dsrc] += 1
            for dsrc in subs_dsources:
                if subs_dsources[dsrc] > max_dsrc[1]:
                    max_dsrc = (dsrc, subs_dsources[dsrc])
            tmp_mark = max_dsrc[0]
            if max_dsrc[1] == subs_size:
                tmp_mark = max_dsrc[0]
            else:
                tmp_mark = last_src_id
            subs_data_mark.append(tmp_mark)

            for tup in res:
                src = tup[0]
                score = long(math.floor(tup[1] * (1000.0 / total)))
                if score == 0:
                    continue

                grp = st.src_to_group(src)
                if grp not in groups_cnt:
                    groups_cnt[grp] = score
                else:
                    groups_cnt[grp] += score

                if src not in groups_cnt:
                    groups_cnt[src] = score
                else:
                    groups_cnt[src] += score

            # Equalize group sizes
            for grp in st.groups:
                grp = grp.lower()
                if grp in groups_cnt:
                    groups_cnt[grp] /= float(st.get_group_size(grp))

            # best group only
            # best_src = res[0][0]
            # best_grp = st.src_to_group(best_src)
            # if best_grp not in groups_cnt:
            #     groups_cnt[best_grp] = 1
            # else:
            #     groups_cnt[best_grp] += 1

        print('Combinations: (N, k)=(%d, %d) = %d' %
              (subs_count, subs_size, scipy.misc.comb(subs_count, subs_size)))

        sources = st.groups
        values = []
        for source in sources:
            val = groups_cnt[source] if source in groups_cnt else 0
            values.append(val)
        bar_chart(sources, values,
                  xlabel='# of occurrences as top group (best fit)',
                  title='Groups vs. %d random %d-subsets' % (subs_count, subs_size))

        # PCA stuff
        X = np.array(subs_data)
        pca = PCA(n_components=2)
        pU, pS, pV = pca._fit(X)
        X_transformed = pca.transform(X)
        subs_data_mark_pca = np.array(subs_data_mark)

        print('Sources: ')
        print(st.sources)

        print('PCA input data shape %d x %d' % (len(subs_data), len(subs_data[0])))
        print('PCA mean: \n%s \nPCA components: \n' % pca.mean_)
        print(pca.components_)

        print('PCA components x: ')
        for x in pca.components_[0]:
            print x
        print('\nPCA components y: ')
        for y in pca.components_[1]:
            print y

        # print('\nPCA U,S,V')
        # print(pU)
        # print(pS)
        # print(pV)

        colors = ['blue', 'red', 'green', 'gray', 'yellow']
        plt.rcdefaults()
        for src_id in range(0, dsrc_num):
            plt.scatter(X_transformed[subs_data_mark_pca == src_id, 0],
                        X_transformed[subs_data_mark_pca == src_id, 1],
                        color=colors[src_id],
                        alpha=0.5 if src_id < dsrc_num - 1 else 0.2)
        plt.legend(loc="best", shadow=False, scatterpoints=1)

        # plt.scatter([x[0] for x in X_transformed],
        #             [x[1] for x in X_transformed],
        #             alpha=0.5)

        plt.show()

    # PCA against defined sources with known distributions?
    # Creates "background distribution" we want to match to
    if args.pca_src:
        # Four axes, returned as a 2-d array
        plt.rcdefaults()
        # f, axarr = plt.subplots(len(st.sources), 1)
        src_k = args.pca_src_k
        src_n = args.pca_src_n

        # prepare PDF
        ppdf = PdfPages('test.pdf')  # todo-filenae-from-set

        sources_to_test = st.sources[20:25] + [x for x in st.sources if 'micro' in x.lower()]

        # compute for each source
        src_mark_idx = len(subs_data_mark)
        subs_data_src = subs_data
        subs_data_mark_src = subs_data_mark
        for src_idx, source in enumerate(sources_to_test):
            # cur_plot = axarr[src_idx]
            cur_plot = plt

            print('Plotting PCA source %s %d/%d' % (source, src_idx + 1, len(sources_to_test)))

            # Extend subs_data_src with draws from the source distribution
            for i in range(0, src_n):
                masks = []
                for tmpk in range(0, src_k):
                    masks.append(st.sample_source_distrib(source))
                src_total_match = comp_total_match_dict(masks, st)
                res = key_val_to_list(src_total_match)

                total = 0.0
                for tup in res:
                    total += tup[1]

                # data vectors for PCA
                tmp_data = []
                for idx, tmp_src in enumerate(st.sources):
                    val = src_total_match[tmp_src]
                    val = long(math.floor(val * (1000.0 / total)))
                    tmp_data.append(val)

                # PCA on groups.
                # if want PCA on sources, use subs_data.append(tmp_data)
                subs_data_src.append(tmp_data)
                subs_data_mark_src.append(src_mark_idx)

            # PCA stuff
            X = np.array(subs_data_src)
            pca = PCA(n_components=2)
            pU, pS, pV = pca._fit(X)
            X_transformed = pca.transform(X)
            subs_data_mark_pca = np.array(subs_data_mark_src)

            colors = ['blue', 'red', 'green', 'gray', 'yellow']

            # plot input sources
            for src_id in range(0, dsrc_num):
                cur_plot.scatter(X_transformed[subs_data_mark_pca == src_id, 0],
                                 X_transformed[subs_data_mark_pca == src_id, 1],
                                 color=colors[src_id],
                                 alpha=0.5 if src_id < dsrc_num - 1 else 0.2)

            # plot the source stuff
            cur_plot.scatter(X_transformed[subs_data_mark_pca == src_mark_idx, 0],
                             X_transformed[subs_data_mark_pca == src_mark_idx, 1],
                             color='gray', marker='+', alpha=0.05)

            cur_plot.legend(loc="best", shadow=False, scatterpoints=1)
            cur_plot.title('Src [%s] input: %s' % (source, (', '.join(src_names))))
            cur_plot.savefig(ppdf, format='pdf')
            cur_plot.clf()

        print('Finalizing PDF...')
        # plt.savefig(ppdf, format='pdf')
        ppdf.close()
        pass

    if args.distrib:
        # Plotting distributions for groups, to the PDF
        plt.rcdefaults()
        ppdf = PdfPages('groups_distrib.pdf')

        # Compute for each source
        range_ = st.masks
        range_idx = np.arange(len(st.masks))
        for grp_idx, grp in enumerate(st.groups):
            cur_data = st.groups_masks_prob[grp]
            raw_data = [cur_data[x] for x in st.masks]

            cur_plot = plt
            logger.debug('Plotting distribution %02d/%02d : %s ' % (grp_idx + 1, len(st.groups), grp))

            axes = cur_plot.gca()
            axes.set_xlim([0, len(st.masks)])

            cur_plot.bar(range_idx, raw_data, linewidth=0, width=0.4)
            cur_plot.title('%s (%s)' % (grp, get_group_desc(grp, st)))
            cur_plot.savefig(ppdf, format='pdf')
            cur_plot.clf()

        # Print input data - per source
        max_src = max(masks_src)
        bars = []
        for src_id in range(max_src + 1):
            axes = plt.gca()
            axes.set_xlim([0, len(st.masks)])

            map_data = {}
            for mask in st.masks:
                map_data[mask] = 0.0
            for mask_idx, mask in enumerate(masks_db):
                if masks_src[mask_idx] == src_id:
                    map_data[mask] += 1

            raw_data = []
            for mask in st.masks:
                raw_data.append(map_data[mask])

            b1 = plt.bar(range_idx, raw_data, linewidth=0, width=0.4)
            bars.append(b1)

            plt.title('Source %d' % src_id)
            plt.savefig(ppdf, format='pdf')
            plt.clf()

        # Group distribution + source:
        if args.distribmix:
            width = 0.25
            range_idx = np.arange(len(st.masks))

            # One source to the graph
            max_src = max(masks_src)
            cur_plot = plt

            for src_id in range(max_src + 1):
                bars = []
                logger.debug('Plotting mix distribution src %d ' % src_id)

                map_data = {}
                for mask in st.masks:
                    map_data[mask] = 0.0
                for mask_idx, mask in enumerate(masks_db):
                    if masks_src[mask_idx] == src_id:
                        map_data[mask] += 1

                raw_data = []
                for mask in st.masks:
                    raw_data.append(map_data[mask])
                raw_data = np.array(raw_data)
                raw_data /= float(sum(raw_data))

                for grp_idx, grp in enumerate(st.groups):
                    logger.debug(' - Plotting mix distribution %02d/%02d : %s ' %
                                 (grp_idx + 1, len(st.groups), grp))

                    # Source
                    fig, ax = plt.subplots()
                    b1 = ax.bar(range_idx + width, raw_data, linewidth=0, width=width, color='r')
                    bars.append(b1)

                    # Group
                    cur_data2 = st.groups_masks_prob[grp]
                    raw_data2 = [cur_data2[x] for x in st.masks]

                    bar1 = ax.bar(range_idx, raw_data2, linewidth=0, width=width, color='b')
                    bars.append(bar1)

                    ax.legend(tuple([x[0] for x in bars]), tuple(['Src %d' % src_id, grp]))
                    ax.set_xlim([0, len(st.masks)])

                    cur_plot.title('%s + source %d' % (grp, src_id))
                    cur_plot.savefig(ppdf, format='pdf')
                    cur_plot.clf()

        logger.info('Finishing PDF')
        ppdf.close()
        pass

    if args.mixture:
        # http://www.pymix.org/pymix/index.php?n=PyMix.Tutorial#bayesmix
        # 1. Create mixture model = add discrete distributions to the package
        dists = []
        alphabet = mixture.Alphabet(st.masks)
        taken_src = []

        for src in st.sources:
            if 'openssl 1.0.2g' == src or 'microsoft .net' == src:
                pass
            else:
                continue
            print(' - Source: %s' % src)

            taken_src.append(src)
            probs = []
            for m in st.masks:
                probs.append(st.sources_masks_prob[src][m])

            d = mixture.DiscreteDistribution(len(alphabet), probs, alphabet=alphabet)
            dists.append(d)

        # 2. Create the model, for now, with even distribution among components.
        comp_weights = [1.0 / len(dists)] * len(dists)
        mmodel = mixture.MixtureModel(len(dists), comp_weights, dists)
        print '-' * 80
        print mmodel
        print '-' * 80

        # dump mixtures to the file
        mixture.writeMixture(mmodel, 'src.mix')

        # 3. Input data - array of input masks
        masks_data = [[x] for x in masks_db]
        data = mixture.DataSet()
        data.fromList(masks_data)
        data.internalInit(mmodel)

        print masks_data
        print data
        print '---------'

        # 4. Compute EM
        # if there is a distribution in the input data which has zero matching inputs,
        # an exception will be thrown. Later - discard such source from the input...
        print mmodel.modelInitialization(data, 1)
        print('EM start: ')

        ress = []
        for r in range(10):
            mmodel.modelInitialization(data, 1)
            emres = mmodel.EM(data, 1000, 0.00000000000000001)
            ress.append(emres)
        emres = max(ress, key=lambda x: x[1])

        # print mmodel.randMaxEM(data, 10, 40, 0.1)
        print emres

        # Plot
        plt.rcdefaults()
        # plt.plot(range(0, len(emres[0][3])), [2.71828**x for x in emres[0][3]], 'o')
        # plt.plot(range(0, len(emres[0][3])), emres[0][3], 'k')
        # plt.show()

        for i in range(0, 5):
            print('-------')
            for idx, src in enumerate(emres[0]):
                print('- i:%02d src: %02d, val: %s' % (i, idx, src[i]))

        colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(taken_src)))
        range_ = range(0, len(emres[0][0]))
        bars = []
        for idx, src in enumerate(emres[0]):
            b1 = plt.bar(range_, [2.71828**x for x in src], color=colors[idx])
            bars.append(b1)

        plt.legend(tuple(bars), tuple(taken_src))
        plt.grid(True)
        plt.show()

        # for src in emres[0]:
        #     plt.plot(range(0, len(src)), [2.71828**x for x in src], 'o')
        #     # plt.grid(True)
        #     # plt.show()
        #
        # # plt.scatter(mask_map_last_x, mask_map_last_y, c='red', s=scale, alpha=0.3)
        # # plt.legend()
        # plt.grid(True)
        # plt.show()

    # Chisquare
    for source in st.sources_masks:
        cn = st.sources_cn[source]
        # chi = chisquare()
        # gen = keys_basic.generate_pubkey_mask()

    # 2D Key plot
    if args.plot_key_dist:
        plot_key_mask_dist(masks_db, st)