def testdtree():
    """Sample 1000 points from a two-component conditional-Gaussian mixture
    and fit a second, differently parameterised mixture to them with EM."""
    # Dependency structures handed to ConditionalGaussDistribution.
    # presumably feature index -> parent index with -1 for the root; confirm
    # against the mixture package.
    chain_tree = {0: -1, 1: 0, 2: 1}
    star_tree = {0: -1, 1: 0, 2: 0}

    # Generating model.
    gen_first = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(
            3, [0, 1, 0], [0, -0.1, 0.1], [0.5, 0.5, 0.5], chain_tree)
    ])
    gen_second = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(
            3, [-1, 0, 1], [0, 0.1, -0.1], [0.5, 0.5, 0.5], star_tree)
    ])
    weights = [0.4, 0.6]
    generator = mixture.MixtureModel(2, weights, [gen_first, gen_second])

    random.seed(1)
    samples = generator.sampleDataSet(1000)
    print(samples)

    # These dependence-tree components are built and then discarded; the
    # conditional-Gauss components below are the ones actually trained.
    mixture.ProductDistribution([
        mixture.DependenceTreeDistribution(
            3, [0.1, 1.1, 0.1], [0, 0, 0], [1.0, 1.0, 1.0])
    ])
    mixture.ProductDistribution([
        mixture.DependenceTreeDistribution(
            3, [-1, 0, -0.1], [0, 0, 0], [1.0, 1.0, 1.0])
    ])

    # Model to be trained, started from different parameters.
    fit_first = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(
            3, [0, 1, 0], [0.0, 0.1, 0.1], [0.1, 0.1, 0.1], chain_tree)
    ])
    fit_second = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(
            3, [-1, 0, 1], [0.0, 0.1, 0.1], [0.1, 0.1, 0.1], star_tree)
    ])
    trained = mixture.MixtureModel(2, weights, [fit_first, fit_second])
    trained.modelInitialization(samples)
    trained.EM(samples, 100, 0.01, silent=1)
def testLymphData():
    """Fit a 5-component dependence-tree mixture over 11 features to the
    data read from 'data/ltree2_2fold.txt', using EM."""
    n_components = 5
    n_features = 11

    # Every component starts from all-zero parameter vectors; each gets its
    # own fresh lists so components never share parameter storage.
    components = [
        mixture.ProductDistribution([
            mixture.DependenceTreeDistribution(
                n_features,
                [0] * n_features,
                [0] * n_features,
                [0] * n_features)
        ])
        for _ in range(n_components)
    ]

    # Uniform mixing weights.
    weights = numpy.array([1.0] * n_components) / n_components
    model = mixture.MixtureModel(n_components, weights, components)

    dataset = mixture.DataSet()
    dataset.fromFiles(['data/ltree2_2fold.txt'], )

    model.modelInitialization(dataset)
    model.EM(dataset, 100, 0.01, silent=1)
def intialize_normal_model(ng, data):
    """Build a mixture model with ``ng`` equally weighted components.

    The components come from ``norm_multdist(ng, data)``, which must yield
    exactly ``ng`` distributions. Earlier versions only handled ``ng`` from
    2 to 8 and failed with an UnboundLocalError otherwise; any positive
    ``ng`` is accepted now.

    Args:
        ng: number of mixture components (positive integer).
        data: data forwarded unchanged to ``norm_multdist``.

    Returns:
        A ``mixture.MixtureModel`` with ``ng`` components and uniform weights.

    Raises:
        ValueError: if ``ng`` is smaller than 1, or ``norm_multdist`` does not
            return exactly ``ng`` components.
    """
    if ng < 1:
        raise ValueError('ng must be a positive integer, got %r' % (ng,))

    # Uniform prior over the components.
    mod_ps = np.repeat(1.0 / ng, ng)

    components = list(norm_multdist(ng, data))
    # The original tuple unpacking enforced this count implicitly; keep the
    # check so a mismatch still surfaces as a ValueError.
    if len(components) != ng:
        raise ValueError('expected %d components from norm_multdist, got %d' %
                         (ng, len(components)))

    return mixture.MixtureModel(ng, mod_ps, components)
def setUp(self):
    """Create the fixtures shared by the tests: the alphabet ``self.DIAG``,
    two discrete HMMs ``self.h1`` / ``self.h2``, the full mixture ``self.m``
    (normal + multinomial + HMM per component) and the sampling mixture
    ``self.gen`` (normal + multinomial only)."""

    def build_hmm(transitions, emissions, start):
        # Discrete HMM over the integer symbols of IntegerRange(0, 4).
        return mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)),
            transitions, emissions, start)

    # building generating models
    self.DIAG = mixture.Alphabet(['.', '0', '8', '1'])

    self.h1 = build_hmm(
        [[0.3, 0.6, 0.1], [0.0, 0.5, 0.5], [0.4, 0.2, 0.4]],
        [[0.5, 0.2, 0.1, 0.2], [0.5, 0.4, 0.05, 0.05],
         [0.8, 0.1, 0.05, 0.05]],
        [1.0, 0.0, 0.0])

    # NOTE(review): the second emission row sums to 1.1, not 1.0; values are
    # kept exactly as in the original fixture. Confirm whether intended.
    self.h2 = build_hmm(
        [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]],
        [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.5], [0.2, 0.2, 0.3, 0.3]],
        [0.6, 0.4, 0.0])

    normal_a = mixture.NormalDistribution(2.5, 0.5)
    normal_b = mixture.NormalDistribution(6.0, 0.8)
    multi_a = mixture.MultinomialDistribution(
        3, 4, [0.23, 0.26, 0.26, 0.25], alphabet=self.DIAG)
    multi_b = mixture.MultinomialDistribution(
        3, 4, [0.7, 0.1, 0.1, 0.1], alphabet=self.DIAG)

    full_a = mixture.ProductDistribution([normal_a, multi_a, self.h1])
    full_b = mixture.ProductDistribution([normal_b, multi_b, self.h2])
    weights = [0.4, 0.6]
    self.m = mixture.MixtureModel(2, weights, [full_a, full_b])

    # mixture for sampling
    sample_a = mixture.ProductDistribution([normal_a, multi_a])
    sample_b = mixture.ProductDistribution([normal_b, multi_b])
    self.gen = mixture.MixtureModel(2, weights, [sample_a, sample_b])
def testem(self):
    """Run EM on a complex DataSet combining sampled scalar/multinomial data
    with sampled HMM sequences, starting from a differently parameterised
    mixture."""

    def build_hmm(transitions, emissions, start):
        # Discrete HMM over the integer symbols of IntegerRange(0, 4).
        return mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)),
            transitions, emissions, start)

    # complex DataSet with HMM sequences and scalar data
    scalar_part = self.gen.sampleSet(100)

    # sampling hmm data
    sequences = self.h1.hmm.sample(40, 10)
    sequences.merge(self.h2.hmm.sample(60, 10))

    data = mixtureHMM.SequenceDataSet()
    data.fromGHMM(scalar_part, [sequences])
    data.internalInit(self.m)

    # Training model: same structure as self.m, different starting values.
    train_hmm_a = build_hmm(
        [[0.5, 0.2, 0.3], [0.2, 0.3, 0.5], [0.1, 0.5, 0.4]],
        [[0.2, 0.4, 0.1, 0.3], [0.5, 0.1, 0.2, 0.2],
         [0.4, 0.3, 0.15, 0.15]],
        [0.3, 0.3, 0.4])
    train_hmm_b = build_hmm(
        [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]],
        [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4], [0.2, 0.1, 0.6, 0.1]],
        [0.3, 0.4, 0.3])

    train_normal_a = mixture.NormalDistribution(-1.5, 1.5)
    train_normal_b = mixture.NormalDistribution(9.0, 1.2)
    train_multi_a = mixture.MultinomialDistribution(
        3, 4, [0.1, 0.1, 0.55, 0.25], alphabet=self.DIAG)
    train_multi_b = mixture.MultinomialDistribution(
        3, 4, [0.4, 0.3, 0.1, 0.2], alphabet=self.DIAG)

    comp_a = mixture.ProductDistribution(
        [train_normal_a, train_multi_a, train_hmm_a])
    comp_b = mixture.ProductDistribution(
        [train_normal_b, train_multi_b, train_hmm_b])
    model = mixture.MixtureModel(2, [0.7, 0.3], [comp_a, comp_b])

    model.EM(data, 80, 0.1, silent=1)
def plot_fitting_progress(dummy, n_comp, mix_data, inits):
    """Plot how an EM fit evolves, one EM step at a time.

    First runs a 40-step EM fit and prints the model before and after. Then
    repeats the fit as 40 single-step EM calls, printing the model and drawing
    every component after each step, and finally shows the figure.

    Args:
        dummy: callable that builds one component from an initial peak.
        n_comp: number of mixture components.
        mix_data: data set; its ``dataMatrix`` is plotted and it is fed to EM.
        inits: initial peak positions, one per component.
    """
    plot_mix(mix_data.dataMatrix)

    print('# of components: {}\n'.format(n_comp))
    print(inits)

    weights = [1. / n_comp] * n_comp
    components = [dummy(peak) for peak in inits]

    # Reference run; deep copies keep `components` untouched for the
    # step-wise replay below.
    model = mixture.MixtureModel(n_comp, weights, copy.deepcopy(components))
    print('Initial: {}\n'.format(model))
    _, _llh = model.EM(mix_data, 40, .1)
    print('Final: {}\n'.format(model))

    for step in range(1, 41):
        print('Iter {}\n=======\n'.format(step))
        model = mixture.MixtureModel(n_comp, weights,
                                     copy.deepcopy(components))
        print('Before:\n{}'.format(model))
        _, _llh = model.EM(mix_data, 1, .1)

        # Carry the fitted state over into the next single-step run.
        components = model.components
        weights = model.pi
        print('After:\n{}'.format(model))

        for component in model.components:
            plot_N(*get_moments(component), col=(40 - step) / 60.)

    plt.show()
def getModel(G, p):
    """
    Constructs a PWM MixtureModel with random parameters.

    @param G: number of components
    @param p: number of positions of the binding site

    @return: MixtureModel object
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])

    def random_site():
        # One discrete distribution per position, each with its own random
        # parameter vector over the four symbols.
        columns = [
            mixture.DiscreteDistribution(4, mixture.random_vector(4), DNA)
            for _ in range(p)
        ]
        return mixture.ProductDistribution(columns)

    comps = [random_site() for _ in range(G)]
    # Mixing weights are drawn only after all components have been built.
    weights = mixture.random_vector(G)
    return mixture.MixtureModel(G, weights, comps)
def testsimpleem(self):
    """Run EM on a data set consisting only of sampled HMM sequences, using a
    two-component mixture whose components each wrap a single HMM."""

    def build_component(transitions, emissions, start):
        # A ProductDistribution holding one discrete HMM over
        # IntegerRange(0, 4).
        return mixture.ProductDistribution([
            mixtureHMM.getHMM(
                mixtureHMM.ghmm.IntegerRange(0, 4),
                mixtureHMM.ghmm.DiscreteDistribution(
                    mixtureHMM.ghmm.IntegerRange(0, 4)),
                transitions, emissions, start)
        ])

    # sampling hmm data
    sequences = self.h1.hmm.sample(40, 10)
    sequences.merge(self.h2.hmm.sample(60, 10))

    data = mixtureHMM.SequenceDataSet()
    data.fromGHMM([], [sequences])

    comp_a = build_component(
        [[0.5, 0.2, 0.3], [0.2, 0.3, 0.5], [0.1, 0.5, 0.4]],
        [[0.2, 0.4, 0.1, 0.3], [0.5, 0.1, 0.2, 0.2],
         [0.4, 0.3, 0.15, 0.15]],
        [0.3, 0.3, 0.4])
    comp_b = build_component(
        [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]],
        [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4], [0.2, 0.1, 0.6, 0.1]],
        [0.3, 0.4, 0.3])

    model = mixture.MixtureModel(2, [0.4, 0.6], [comp_a, comp_b])
    data.internalInit(model)
    model.EM(data, 80, 0.1, silent=1)
def getBackgroundModel(p, dist=None):
    """
    Construct background model

    @param p: number of positions of the binding site
    @param dist: background nucleotide frequencies, uniform is default

    @return: MixtureModel representing the background
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])

    # Identity test rather than `== None`: the equality form is evaluated
    # elementwise when `dist` is a numpy array, which makes the `if`
    # ambiguous instead of detecting "no distribution given".
    if dist is None:
        phi = [0.25] * 4
    else:
        phi = dist

    # The same `phi` object is handed to every position, as before.
    dlist = [mixture.DiscreteDistribution(4, phi, DNA) for _ in range(p)]

    comps = [mixture.ProductDistribution(dlist)]
    return mixture.MixtureModel(1, [1.0], comps)
def testinternalinitcomplexempty(self):
    """internalInit on a sequence-only data set must reject the full model
    ``self.m`` and accept an HMM-only mixture."""
    # complex DataSet with HMM sequences only

    # sampling hmm data
    sequences = self.h1.hmm.sample(40, 10)
    sequences.merge(self.h2.hmm.sample(60, 10))

    data = mixtureHMM.SequenceDataSet()
    data.fromGHMM([], [sequences])

    # self.m also has scalar and multinomial features, which this data set
    # lacks, so initialisation is expected to fail its assertion.
    self.assertRaises(AssertionError, data.internalInit, self.m)

    hmm_only = mixture.MixtureModel(
        2,
        [0.4, 0.6],
        [mixture.ProductDistribution([self.h1]),
         mixture.ProductDistribution([self.h2])])
    data.internalInit(hmm_only)

    self.assertEqual(str(data.complexFeature), '[1]')
    self.assertEqual(data.p, 1)
    self.assertEqual(data.suff_p, 1)
def gaussian_decomposition(dist, max_comp=5, max_trials=10, max_steps=40):
    """
    Performs Gaussian decomposition.

    Decompose the input distribution into best-fit Gaussian components. For
    every component count from 1 to max_comp, max_trials mixtures are
    initialised at randomly chosen sample points and fitted with EM. The fit
    with the highest score (second value returned by EM, used as the
    log-likelihood) over all counts and trials is kept; ties go to the
    earliest fit.

    Args:
        dist: input distribution in 'mcerp' format (its `_mcpts` samples are
            the data that is fitted).
        max_comp: maximum number of Gaussian components.
        max_trials: maximum number of trials before increasing # of comp.
            Larger is better/slower.
        max_steps: maximum steps in each fitting process.
            Larger is better/slower.

    Return:
        A list of (pi, mu, sigma) tuples, one per Gaussian component of the
        best mixture: mixing weight, mean, and sigma.

    Raises:
        ValueError: if max_comp or max_trials is smaller than 1, since no
            mixture would be fitted at all.
    """
    if max_comp < 1 or max_trials < 1:
        raise ValueError(
            'max_comp and max_trials must both be at least 1 '
            '(got max_comp=%r, max_trials=%r)' % (max_comp, max_trials))

    mix = dist._mcpts
    data = mixture.DataSet()
    data.fromArray(mix)

    # TODO: what to set for init std? Sweep? Or some desired value for later
    # analytical solving?
    std = 1.
    dummy = functools.partial(mixture.NormalDistribution, sigma=std)

    # A single running best replaces the former per-count/overall pair; with
    # strict '>' the first best fit still wins, and a NaN score in the first
    # trial of a count no longer hides later finite scores of that count.
    best_llh, best_peaks, best_mixture = None, None, None
    for i in range(1, 1 + max_comp):
        logging.debug('Gaussian Decomposition Iter: {}\n'.format(i))
        # Try max_trials times init sampling.
        for _ in range(max_trials):
            pi = [1. / i] * i
            rand_peaks = np.random.choice(mix, i)
            components = [dummy(p) for p in rand_peaks]
            m = mixture.MixtureModel(i, pi, components)
            # Fixed convergence criterion here.
            _, llh = m.EM(data, max_steps, .1, True)
            if best_llh is None or llh > best_llh:
                best_llh, best_peaks, best_mixture = llh, rand_peaks, m

    logging.debug('BEST MIXTURE ({}):\n{}'.format(best_llh, best_mixture))

    # To inspect the fit visually, call:
    # plot_fitting_progress(dummy, len(best_mixture.components), data,
    #                       best_peaks)

    result = []
    for (comp, pi) in zip(best_mixture.components, best_mixture.pi):
        # comp is a ProductDistribution instance which may in general hold
        # several distributions; here exactly one is expected.
        assert (len(comp.distList) == 1)
        for d in comp:
            result.append((pi, d.mu, d.sigma))
    return result
##Peaks: histmaxes ##Peak heights: hist[histmaxes[n]] ##Stdev: stdev emdata = mixture.DataSet() emdata.fromList(data[label][call]) numpeaks = len(histmaxes) gaussian_objects = [] weights = [] for i in xrange(numpeaks): n = mixture.NormalDistribution(histmaxes[i], stdev) gaussian_objects.append(n) weights.append(hist[histmaxes[i]]) totweight = float(sum(weights)) weights = [x / totweight for x in weights] mymix = mixture.MixtureModel(numpeaks, weights, gaussian_objects) # print "Before",mymix mymix.EM(emdata, 40, 0.1) # print "After",mymix print("Number of peaks=", mymix.G) for i in range(mymix.G): print(mymix.pi[i], mymix.components[i]) summary.write(patient) summary.write("\t" + sample) summary.write("\t" + str(len(data[label][call]))) summary.write("\t" + str(call)) summary.write("\t" + label) # summary.write("\t" + str(mean*2)) # summary.write("\t" + str(stdev*2)) for i in range(mymix.G):
def main():
    """Command-line entry point of the key processing tool.

    Stages, in order:

    1. Parse the command line.
    2. Collect key masks from up to four kinds of input: ssl-dump JSON files
       (positional ``files``), PEM public keys (``--pubs``), PEM certificates
       (``--certs``) and freshly generated OpenSSL RSA keys (``--ossl``).
       Every input file, and the OpenSSL batch as a whole, becomes one
       "source" identified by a running ``last_src_id``.
    3. Load the reference statistics (``key_stats.KeyStats``) and print how
       well the collected masks match each known source.
    4. Depending on the flags, run PCA plots, random-subset analysis, PDF
       distribution plots, a PyMix mixture fit and a key mask plot.

    Returns early, right after printing the mask configuration, when no masks
    were collected.
    """
    logger.debug('App started')

    # ---- command line ------------------------------------------------------
    parser = argparse.ArgumentParser(description='Key processing tool')
    parser.add_argument('-t', '--threads', dest='threads', type=int,
                        default=None,
                        help='Number of threads to use for cert download')
    parser.add_argument('--debug', dest='debug', action='store_const',
                        const=True, help='enables debug mode')
    parser.add_argument('--verbose', dest='verbose', action='store_const',
                        const=True, help='enables verbose mode')
    parser.add_argument('--dump-json', dest='dump_json',
                        action='store_const', const=True,
                        help='dumps JSON of the filtered certificates')
    parser.add_argument('--dump-cert', dest='dump_cert',
                        action='store_const', const=True,
                        help='dumps PEM of the filtered certificates')
    parser.add_argument(
        '-f', '--filter-org', dest='filter_org',
        help='Filter out certificates issued with given organization - regex')
    parser.add_argument(
        '--filter-domain', dest='filter_domain',
        help='Filter out certificates issued for the given domain - regex')
    parser.add_argument('--pubs', dest='pubs', nargs=argparse.ZERO_OR_MORE,
                        help='File with public keys (PEM)')
    parser.add_argument('--certs', dest='certs', nargs=argparse.ZERO_OR_MORE,
                        help='File with certificates (PEM)')
    parser.add_argument('--ossl', dest='ossl', type=int, default=None,
                        help='OpenSSL generator')
    parser.add_argument('--per-key-stat', dest='per_key_stat',
                        action='store_const', const=True,
                        help='Print prob matching for each key')
    parser.add_argument('--subs', dest='subs', action='store_const',
                        const=True, help='Plot random subgroups charts')
    parser.add_argument('--subs-k', dest='subs_k', type=int, default=5,
                        help='Size of the subset')
    parser.add_argument('--subs-n', dest='subs_n', type=int, default=1000,
                        help='Number of subsets to sample')
    parser.add_argument('--pca-src', dest='pca_src', action='store_const',
                        const=True,
                        help='Plot PCA sampled distribution vs collected one')
    parser.add_argument(
        '--pca-src-n', dest='pca_src_n', type=int, default=10000,
        help='Number of subsets to sample from source distributions')
    parser.add_argument('--pca-src-k', dest='pca_src_k', type=int, default=3,
                        help='Size of the subset from the source distribution')
    parser.add_argument('--pca-grp', dest='pca_grp', action='store_const',
                        const=True,
                        help='Plot PCA on the input keys (groups)')
    parser.add_argument('--mixture', dest='mixture', action='store_const',
                        const=True,
                        help='Mixture distribution on masks - sources')
    parser.add_argument('--distrib', dest='distrib', action='store_const',
                        const=True, help='Plot distributions - to the PDF')
    parser.add_argument('--distrib-mix', dest='distribmix',
                        action='store_const', const=True,
                        help='Plot distributions groups mixed with sources')
    parser.add_argument('--key-dist', dest='plot_key_dist',
                        action='store_const', const=True,
                        help='Plots key mask distribution')
    parser.add_argument('files', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='file with ssl-dump json output')
    args = parser.parse_args()

    # ---- input collection --------------------------------------------------
    # masks_db[i] is the mask of the i-th key and masks_src[i] the id of the
    # source it came from; src_names maps a source id to a display name.
    last_src_id = 0
    src_names = []
    masks_db = []
    masks_src = []
    cert_db = []
    keys_db = []

    # Input = ssl-dump output
    if len(args.files) > 0:
        # Cert Organization Filtering
        re_org = None if args.filter_org is None else re.compile(
            args.filter_org, re.IGNORECASE)

        # Domain filtering
        re_dom = None if args.filter_domain is None else re.compile(
            args.filter_domain, re.IGNORECASE)

        # Process files
        for fl in args.files:
            with open(fl, mode='r') as fh:
                data = fh.read()

                # Parse json out
                # The JSON payload may be wrapped in BEGIN/END JSON markers;
                # if so, only the text between them is parsed.
                if '-----BEGIN JSON-----' in data:
                    if '-----END JSON-----' not in data:
                        raise ValueError('BEGIN JSON present but END JSON not')
                    match = re.search(
                        r'-----BEGIN JSON-----(.+?)-----END JSON-----', data,
                        re.MULTILINE | re.DOTALL)
                    if match is None:
                        raise ValueError('Could not extract JSON')
                    data = match.group(1)

                json_data = json.loads(data)
                for cert in json_data:
                    org = cert['org']
                    if org is None:
                        org = ''
                    # Keep only certificates whose organization matches.
                    if re_org is not None and re_org.match(org) is None:
                        if args.verbose:
                            print('Organization filtered out %s' % org)
                        continue
                    # Keep only certificates whose CN or any alternative
                    # name matches the domain regex.
                    if re_dom is not None:
                        dom_match = re_dom.match(cert['cn']) is not None
                        for alt in cert['alts']:
                            dom_match |= re_dom.match(alt) is not None
                        if not dom_match:
                            if args.verbose:
                                print('Domain filtered out %s' % cert['cn'])
                            continue

                    cert_db.append(cert)
                    masks_db.append(cert['pubkey']['mask'])
                    masks_src.append(last_src_id)
            # One source per input file.
            src_names.append(fl)
            last_src_id += 1

        if args.verbose:
            print('Certificate database size %d' % len(cert_db))
        if args.dump_json:
            print(json.dumps(cert_db))
        if args.dump_cert:
            for cert in cert_db:
                print cert['cert']

    # public key list processing
    if args.pubs is not None:
        for pubf in args.pubs:
            with open(pubf, mode='r') as fh:
                data = fh.read()
                keys = []
                # group(0) keeps the PEM armor lines, which the loader needs.
                for match in re.finditer(
                        r'-----BEGIN PUBLIC KEY-----(.+?)-----END PUBLIC KEY-----',
                        data, re.MULTILINE | re.DOTALL):
                    key = match.group(0)
                    keys.append(key)
                print('File %s keys num: %d' % (pubf, len(keys)))

                # pubkey -> mask
                for key in keys:
                    pub = serialization.load_pem_public_key(
                        key, utils.get_backend())
                    mask = keys_basic.compute_key_mask(pub.public_numbers().n)
                    keys_db.append(pub)
                    masks_db.append(mask)
                    masks_src.append(last_src_id)
            src_names.append(pubf)
            last_src_id += 1

    # extract public key from certificate
    if args.certs is not None:
        for certf in args.certs:
            with open(certf, mode='r') as fh:
                data = fh.read()
                certs = []
                for match in re.finditer(
                        r'-----BEGIN CERTIFICATE-----(.+?)-----END CERTIFICATE-----',
                        data, re.MULTILINE | re.DOTALL):
                    cert = match.group(0)
                    certs.append(cert)

                # cert -> mask
                for cert in certs:
                    x509 = utils.load_x509(str(cert))
                    pub = x509.public_key()
                    mask = keys_basic.compute_key_mask(pub.public_numbers().n)
                    keys_db.append(pub)
                    masks_db.append(mask)
                    masks_src.append(last_src_id)
            src_names.append(certf)
            last_src_id += 1

    # generate openssl keys on the fly
    if args.ossl is not None:
        for i in range(0, args.ossl):
            print('Generating RSA1024 key %03d' % i)
            key = OpenSSL.crypto.PKey()
            key.generate_key(OpenSSL.crypto.TYPE_RSA, 1024)
            # Round-trip through PEM to obtain a key object from which the
            # public numbers can be read.
            key_pem = OpenSSL.crypto.dump_privatekey(
                OpenSSL.crypto.FILETYPE_PEM, key)
            priv = serialization.load_pem_private_key(key_pem, None,
                                                      utils.get_backend())
            mask = keys_basic.compute_key_mask(
                priv.public_key().public_numbers().n)
            keys_db.append(priv.public_key())
            masks_db.append(mask)
            masks_src.append(last_src_id)
        # All generated keys share a single source id.
        src_names.append('ossl-%d' % args.ossl)
        last_src_id += 1

    # ---- reference statistics ----------------------------------------------
    # Load statistics
    st = key_stats.KeyStats()
    st.load_tables()
    if args.verbose:
        print('Source stats: ')
        for src in st.sources_cn:
            print(' %30s: %08d' % (src, st.sources_cn[src]))
        print('Group stats:')
        for grp in st.groups:
            print(' %30s: %02d' % (grp, st.get_group_size(grp)))

    # mask indices
    mask_map, mask_max, mask_map_x, mask_map_y, mask_map_last_x, mask_map_last_y = keys_basic.generate_pubkey_mask_indices(
    )
    print('Max mask 1D config: [%d]' % mask_max)
    print('Max mask 2D config: [%d, %d]' % (mask_map_last_x, mask_map_last_y))

    # masks processing part
    # Nothing to analyse without input keys.
    if len(masks_db) == 0:
        return

    # Simple match
    if args.per_key_stat:
        print('Per-key matching: ')
        for idx, mask in enumerate(masks_db):
            print('Key %02d, mask: %s' % (idx, mask))
            res = []
            for src in st.table_prob:
                val = st.table_prob[src][mask]
                res.append((src, val if val is not None else 0))
            print_res(res, st)

    # Total key matching
    use_loglikelihood = True
    print('Fit for all keys in one distribution:')
    # total_weights keeps referring to this result after src_total_match is
    # rebound to a fresh dict below.
    total_weights = src_total_match = comp_total_match_dict(
        masks_db, st, loglikelihood=use_loglikelihood)
    res = key_val_to_list(src_total_match)
    print_res(res, st, loglikelihood=use_loglikelihood)
    res = st.res_src_to_group(res)
    # bar_chart(res=res, title='Fit for all keys')

    # Avg + mean
    print('Avg + mean:')
    src_total_match = {}  # source -> [p1, p2, p3, p4, ..., p_keynum]
    for src in st.table_prob:
        src_total_match[src] = []
        for idx, mask in enumerate(masks_db):
            val = keys_basic.aggregate_mask(st.sources_masks_prob[src], mask)
            if use_loglikelihood:
                if total_weights[src] is not None:
                    src_total_match[src].append(val + total_weights[src])
                else:
                    # Placeholder score for sources without a total weight.
                    src_total_match[src].append(-9999.9)
            else:
                src_total_match[src].append(val * total_weights[src])
            pass
        pass

    # Mean and standard deviation of the per-key scores of each source.
    res = []
    devs = []
    for src in st.sources:
        m = np.mean(src_total_match[src])
        s = np.std(src_total_match[src])
        res.append((src, m))
        devs.append(s)

    # Total output
    print_res(res, st, error=devs, loglikelihood=use_loglikelihood)
    # bar_chart(res=res, error=devs, title='Avg for all keys + error')

    # ---- PCA on the keys - groups ------------------------------------------
    # One vector per key; entry g accumulates table_prob of all sources that
    # belong to group g.
    keys_grp_vec = []
    for idx, mask in enumerate(masks_db):
        keys_grp_vec.append([])
        for src in st.groups:
            keys_grp_vec[idx].append(0)
        for idxs, src in enumerate(st.sources):
            grp = st.src_to_group(src)
            prob = st.table_prob[src][mask]
            keys_grp_vec[idx][st.get_group_idx(grp)] += prob

    if args.pca_grp:
        X = np.array(keys_grp_vec)
        pca = PCA(n_components=2)
        pca.fit(X)
        X_transformed = pca.transform(X)
        print('PCA mean: %s, components: ' % pca.mean_)
        print(pca.components_)

        # Scatter the projected keys, one colour per input source.
        masks_src_np = np.array(masks_src)
        plt.rcdefaults()
        colors = matplotlib.cm.rainbow(np.linspace(0, 1, last_src_id))
        for src_id in range(0, last_src_id):
            plt.scatter(X_transformed[masks_src_np == src_id, 0],
                        X_transformed[masks_src_np == src_id, 1],
                        label=src_names[src_id],
                        color=colors[src_id],
                        alpha=0.25,
                        marker=',')
        plt.legend(loc="best", shadow=False, scatterpoints=1)
        plt.show()

    # ---- Random subset -----------------------------------------------------
    if args.subs:
        masks_db_tup = []
        for idx, mask in enumerate(masks_db):
            masks_db_tup.append((idx, mask, masks_src[idx]))

        # Many random subsets, top groups
        subs_size = args.subs_k
        subs_count = args.subs_n
        groups_cnt = {}
        subs_data = []
        subs_data_mark = []
        # One extra mark (== last_src_id) is used for subsets that mix keys
        # from several sources.
        dsrc_num = last_src_id + 1

        # Take subs_count samples from the input masks_db, evaluate it,
        # prepare for PCA
        for i in range(0, subs_count):
            masks = random_subset(masks_db_tup, subs_size)
            src_total_match = comp_total_match_dict([x[1] for x in masks], st)
            res = key_val_to_list(src_total_match)

            total = 0.0
            for tup in res:
                total += tup[1]

            # data vectors for PCA
            # Scores are rescaled so that they sum to roughly 1000 and are
            # floored to integers.
            tmp_data = []
            for idx, tmp_src in enumerate(st.sources):
                val = src_total_match[tmp_src]
                val = long(math.floor(val * (1000.0 / total)))
                tmp_data.append(val)

            # PCA on groups.
            # if want PCA on sources, use subs_data.append(tmp_data)
            subs_data.append(tmp_data)
            # res_grp_val = st.res_src_to_group(zip(st.sources, tmp_data))
            # subs_data.append([x[1] for x in res_grp_val])

            # Count how many keys of the subset come from each data source
            # and find the most frequent one.
            subs_dsources = {}
            max_dsrc = (0, 0)
            for dsrc in [x[2] for x in masks]:
                if dsrc not in subs_dsources:
                    subs_dsources[dsrc] = 0
                subs_dsources[dsrc] += 1

            for dsrc in subs_dsources:
                if subs_dsources[dsrc] > max_dsrc[1]:
                    max_dsrc = (dsrc, subs_dsources[dsrc])

            # A subset is marked with its source id only if every key in it
            # comes from that source; otherwise it gets the "mixed" mark.
            tmp_mark = max_dsrc[0]
            if max_dsrc[1] == subs_size:
                tmp_mark = max_dsrc[0]
            else:
                tmp_mark = last_src_id
            subs_data_mark.append(tmp_mark)

            # Accumulate the rescaled scores per group and per source
            # (both are keyed into the same groups_cnt dict).
            for tup in res:
                src = tup[0]
                score = long(math.floor(tup[1] * (1000.0 / total)))
                if score == 0:
                    continue
                grp = st.src_to_group(src)
                if grp not in groups_cnt:
                    groups_cnt[grp] = score
                else:
                    groups_cnt[grp] += score
                if src not in groups_cnt:
                    groups_cnt[src] = score
                else:
                    groups_cnt[src] += score

        # Equalize group sizes
        for grp in st.groups:
            grp = grp.lower()
            if grp in groups_cnt:
                groups_cnt[grp] /= float(st.get_group_size(grp))

        # best group only
        # best_src = res[0][0]
        # best_grp = st.src_to_group(best_src)
        # if best_grp not in groups_cnt:
        #     groups_cnt[best_grp] = 1
        # else:
        #     groups_cnt[best_grp] += 1

        print('Combinations: (N, k)=(%d, %d) = %d' %
              (subs_count, subs_size, scipy.misc.comb(subs_count, subs_size)))

        sources = st.groups
        values = []
        for source in sources:
            val = groups_cnt[source] if source in groups_cnt else 0
            values.append(val)
        bar_chart(sources,
                  values,
                  xlabel='# of occurrences as top group (best fit)',
                  title='Groups vs. %d random %d-subsets' %
                  (subs_count, subs_size))

        # PCA stuff
        X = np.array(subs_data)
        pca = PCA(n_components=2)
        # NOTE(review): _fit is a private method of the PCA object; its
        # return value may differ between library versions - confirm.
        pU, pS, pV = pca._fit(X)
        X_transformed = pca.transform(X)
        subs_data_mark_pca = np.array(subs_data_mark)

        print('Sources: ')
        print(st.sources)
        print('PCA input data shape %d x %d' %
              (len(subs_data), len(subs_data[0])))
        print('PCA mean: \n%s \nPCA components: \n' % pca.mean_)
        print(pca.components_)

        print('PCA components x: ')
        for x in pca.components_[0]:
            print x
        print('\nPCA components y: ')
        for y in pca.components_[1]:
            print y

        # print('\nPCA U,S,V')
        # print(pU)
        # print(pS)
        # print(pV)

        # Only five colours are defined, so dsrc_num must not exceed 5.
        colors = ['blue', 'red', 'green', 'gray', 'yellow']
        plt.rcdefaults()
        for src_id in range(0, dsrc_num):
            plt.scatter(X_transformed[subs_data_mark_pca == src_id, 0],
                        X_transformed[subs_data_mark_pca == src_id, 1],
                        color=colors[src_id],
                        alpha=0.5 if src_id < dsrc_num - 1 else 0.2)
        plt.legend(loc="best", shadow=False, scatterpoints=1)

        # plt.scatter([x[0] for x in X_transformed],
        #             [x[1] for x in X_transformed],
        #             alpha=0.5)

        plt.show()

        # PCA against defined sources with known distributions?
        # Creates "background distribution" we want to match to
        # This part reuses subs_data, subs_data_mark and dsrc_num from the
        # random-subset analysis above.
        if args.pca_src:
            # Four axes, returned as a 2-d array
            plt.rcdefaults()
            #f, axarr = plt.subplots(len(st.sources), 1)
            src_k = args.pca_src_k
            src_n = args.pca_src_n

            # prepare PDF
            ppdf = PdfPages('test.pdf')  # todo-filenae-from-set
            sources_to_test = st.sources[20:25] + [
                x for x in st.sources if 'micro' in x.lower()
            ]

            # compute for each source
            src_mark_idx = len(subs_data_mark)
            # These are aliases, not copies: draws appended below accumulate
            # in subs_data / subs_data_mark across all tested sources.
            subs_data_src = subs_data
            subs_data_mark_src = subs_data_mark
            for src_idx, source in enumerate(sources_to_test):
                # cur_plot = axarr[src_idx]
                cur_plot = plt
                print('Plotting PCA source %s %d/%d' %
                      (source, src_idx + 1, len(sources_to_test)))

                # Extend subs_data_src with draws from the source distribution
                for i in range(0, src_n):
                    masks = []
                    for tmpk in range(0, src_k):
                        masks.append(st.sample_source_distrib(source))
                    src_total_match = comp_total_match_dict(masks, st)
                    res = key_val_to_list(src_total_match)

                    total = 0.0
                    for tup in res:
                        total += tup[1]

                    # data vectors for PCA
                    tmp_data = []
                    for idx, tmp_src in enumerate(st.sources):
                        val = src_total_match[tmp_src]
                        val = long(math.floor(val * (1000.0 / total)))
                        tmp_data.append(val)

                    # PCA on groups.
                    # if want PCA on sources, use subs_data.append(tmp_data)
                    subs_data_src.append(tmp_data)
                    subs_data_mark_src.append(src_mark_idx)

                # PCA stuff
                X = np.array(subs_data_src)
                pca = PCA(n_components=2)
                pU, pS, pV = pca._fit(X)
                X_transformed = pca.transform(X)
                subs_data_mark_pca = np.array(subs_data_mark_src)

                colors = ['blue', 'red', 'green', 'gray', 'yellow']

                # plot input sources
                for src_id in range(0, dsrc_num):
                    cur_plot.scatter(
                        X_transformed[subs_data_mark_pca == src_id, 0],
                        X_transformed[subs_data_mark_pca == src_id, 1],
                        color=colors[src_id],
                        alpha=0.5 if src_id < dsrc_num - 1 else 0.2)

                # plot the source stuff
                cur_plot.scatter(
                    X_transformed[subs_data_mark_pca == src_mark_idx, 0],
                    X_transformed[subs_data_mark_pca == src_mark_idx, 1],
                    color='gray',
                    marker='+',
                    alpha=0.05)

                cur_plot.legend(loc="best", shadow=False, scatterpoints=1)
                cur_plot.title('Src [%s] input: %s' %
                               (source, (', '.join(src_names))))
                # One PDF page per tested source.
                cur_plot.savefig(ppdf, format='pdf')
                cur_plot.clf()

            print('Finalizing PDF...')
            # plt.savefig(ppdf, format='pdf')
            ppdf.close()
            pass

    # ---- distribution plots to PDF -----------------------------------------
    if args.distrib:
        # Plotting distributions for groups, to the PDF
        plt.rcdefaults()
        ppdf = PdfPages('groups_distrib.pdf')

        # Compute for each source
        range_ = st.masks
        range_idx = np.arange(len(st.masks))
        for grp_idx, grp in enumerate(st.groups):
            cur_data = st.groups_masks_prob[grp]
            raw_data = [cur_data[x] for x in st.masks]
            cur_plot = plt
            logger.debug('Plotting distribution %02d/%02d : %s ' %
                         (grp_idx + 1, len(st.groups), grp))
            axes = cur_plot.gca()
            axes.set_xlim([0, len(st.masks)])
            cur_plot.bar(range_idx, raw_data, linewidth=0, width=0.4)
            cur_plot.title('%s (%s)' % (grp, get_group_desc(grp, st)))
            cur_plot.savefig(ppdf, format='pdf')
            cur_plot.clf()

        # Print input data - per source
        # One page per input source with the raw count of keys per mask.
        max_src = max(masks_src)
        bars = []
        for src_id in range(max_src + 1):
            axes = plt.gca()
            axes.set_xlim([0, len(st.masks)])
            map_data = {}
            for mask in st.masks:
                map_data[mask] = 0.0
            for mask_idx, mask in enumerate(masks_db):
                if masks_src[mask_idx] == src_id:
                    map_data[mask] += 1
            raw_data = []
            for mask in st.masks:
                raw_data.append(map_data[mask])
            b1 = plt.bar(range_idx, raw_data, linewidth=0, width=0.4)
            bars.append(b1)
            plt.title('Source %d' % src_id)
            plt.savefig(ppdf, format='pdf')
            plt.clf()

        # Group distribution + source:
        # Writes into the same PDF, so it depends on ppdf opened above.
        if args.distribmix:
            width = 0.25
            range_idx = np.arange(len(st.masks))

            # One source to the graph
            max_src = max(masks_src)
            cur_plot = plt
            for src_id in range(max_src + 1):
                bars = []
                logger.debug('Plotting mix distribution src %d ' % src_id)
                map_data = {}
                for mask in st.masks:
                    map_data[mask] = 0.0
                for mask_idx, mask in enumerate(masks_db):
                    if masks_src[mask_idx] == src_id:
                        map_data[mask] += 1
                raw_data = []
                for mask in st.masks:
                    raw_data.append(map_data[mask])
                # Normalise the counts to relative frequencies.
                raw_data = np.array(raw_data)
                raw_data /= float(sum(raw_data))

                for grp_idx, grp in enumerate(st.groups):
                    logger.debug(
                        ' - Plotting mix distribution %02d/%02d : %s ' %
                        (grp_idx + 1, len(st.groups), grp))

                    # Source
                    fig, ax = plt.subplots()
                    b1 = ax.bar(range_idx + width,
                                raw_data,
                                linewidth=0,
                                width=width,
                                color='r')
                    bars.append(b1)

                    # Group
                    cur_data2 = st.groups_masks_prob[grp]
                    raw_data2 = [cur_data2[x] for x in st.masks]
                    bar1 = ax.bar(range_idx,
                                  raw_data2,
                                  linewidth=0,
                                  width=width,
                                  color='b')
                    bars.append(bar1)

                    # bars is reset per source, not per group, so it keeps
                    # growing across the groups of one source.
                    ax.legend(tuple([x[0] for x in bars]),
                              tuple(['Src %d' % src_id, grp]))
                    ax.set_xlim([0, len(st.masks)])
                    cur_plot.title('%s + source %d' % (grp, src_id))
                    cur_plot.savefig(ppdf, format='pdf')
                    cur_plot.clf()

        logger.info('Finishing PDF')
        ppdf.close()
        pass

    # ---- mixture model fit -------------------------------------------------
    if args.mixture:
        # http://www.pymix.org/pymix/index.php?n=PyMix.Tutorial#bayesmix
        # 1. Create mixture model = add discrete distributions to the package
        dists = []
        alphabet = mixture.Alphabet(st.masks)
        taken_src = []
        for src in st.sources:
            # Only these two sources are used as mixture components.
            if 'openssl 1.0.2g' == src or 'microsoft .net' == src:
                pass
            else:
                continue

            print(' - Source: %s' % src)
            taken_src.append(src)
            probs = []
            for m in st.masks:
                probs.append(st.sources_masks_prob[src][m])
            d = mixture.DiscreteDistribution(len(alphabet),
                                             probs,
                                             alphabet=alphabet)
            dists.append(d)

        # 2. Create the model, for now, with even distribution among components.
        comp_weights = [1.0 / len(dists)] * len(dists)
        mmodel = mixture.MixtureModel(len(dists), comp_weights, dists)
        print '-' * 80
        print mmodel
        print '-' * 80

        # dump mixtures to the file
        mixture.writeMixture(mmodel, 'src.mix')

        # 3. Input data - array of input masks
        masks_data = [[x] for x in masks_db]
        data = mixture.DataSet()
        data.fromList(masks_data)
        data.internalInit(mmodel)
        print masks_data
        print data
        print '---------'

        # 4. Compute EM
        # if there is a distribution in the input data which has zero matching inputs,
        # an exception will be thrown. Later - discard such source from the input...
        print mmodel.modelInitialization(data, 1)
        print('EM start: ')
        # Ten restarts of EM; the result with the largest second element
        # is kept.
        ress = []
        for r in range(10):
            mmodel.modelInitialization(data, 1)
            emres = mmodel.EM(data, 1000, 0.00000000000000001)
            ress.append(emres)
        emres = max(ress, key=lambda x: x[1])

        # print mmodel.randMaxEM(data, 10, 40, 0.1)
        print emres

        # Plot
        plt.rcdefaults()
        # plt.plot(range(0, len(emres[0][3])), [2.71828**x for x in emres[0][3]], 'o')
        # plt.plot(range(0, len(emres[0][3])), emres[0][3], 'k')
        # plt.show()

        # Print the first five values of every row of emres[0].
        for i in range(0, 5):
            print('-------')
            for idx, src in enumerate(emres[0]):
                print('- i:%02d src: %02d, val: %s' % (i, idx, src[i]))

        colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(taken_src)))
        range_ = range(0, len(emres[0][0]))
        bars = []
        for idx, src in enumerate(emres[0]):
            # Values are exponentiated with an approximation of e before
            # plotting (presumably they are log-probabilities - confirm).
            b1 = plt.bar(range_, [2.71828**x for x in src], color=colors[idx])
            bars.append(b1)
        plt.legend(tuple(bars), tuple(taken_src))
        plt.grid(True)
        plt.show()

        # for src in emres[0]:
        #     plt.plot(range(0, len(src)), [2.71828**x for x in src], 'o')
        #     # plt.grid(True)
        #     # plt.show()
        #
        # # plt.scatter(mask_map_last_x, mask_map_last_y, c='red', s=scale, alpha=0.3)
        # # plt.legend()
        # plt.grid(True)
        # plt.show()

    # Chisquare
    # Placeholder: the loop only reads the count; the test itself is not
    # implemented.
    for source in st.sources_masks:
        cn = st.sources_cn[source]
        # chi = chisquare()
        # gen = keys_basic.generate_pubkey_mask()

    # 2D Key plot
    if args.plot_key_dist:
        plot_key_mask_dist(masks_db, st)
# Two mixture components, each the product of a normal distribution, a
# multinomial over the DIAG alphabet and an HMM (h1 / h2 and DIAG are defined
# earlier in the script).
n1 = mixture.NormalDistribution(2.5, 0.5)
n2 = mixture.NormalDistribution(6.0, 0.8)
mult1 = mixture.MultinomialDistribution(3,
                                        4, [0.23, 0.26, 0.26, 0.25],
                                        alphabet=DIAG)
mult2 = mixture.MultinomialDistribution(3,
                                        4, [0.7, 0.1, 0.1, 0.1],
                                        alphabet=DIAG)
c1 = mixture.ProductDistribution([n1, mult1, h1])
c2 = mixture.ProductDistribution([n2, mult2, h2])
mpi = [0.4, 0.6]
m = mixture.MixtureModel(2, mpi, [c1, c2])
#print m
#print "-->",m.components[0].suff_dataRange

# ----------- constructing complex DataSet ----------------
# mixture for sampling
# Same normal and multinomial parts as above but without the HMMs; 100
# samples are drawn from it with the same mixing weights.
gc1 = mixture.ProductDistribution([n1, mult1])
gc2 = mixture.ProductDistribution([n2, mult2])
gen = mixture.MixtureModel(2, mpi, [gc1, gc2])
dat = gen.sampleSet(100)
#print dat

# sampling hmm data
# Second component of the generating model: three normal features and one
# discrete feature with four symbols (n21 and c1 are defined earlier in the
# script).
n22 = mixture.NormalDistribution(-6.0, 0.5)
n23 = mixture.NormalDistribution(3.0, 0.7)
d24 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.4, 0.4])
c2 = mixture.ProductDistribution([n21, n22, n23, d24])

# Third component, same layout.
n31 = mixture.NormalDistribution(2.0, 0.5)
n32 = mixture.NormalDistribution(-3.0, 0.5)
n33 = mixture.NormalDistribution(3.0, 0.7)
d34 = mixture.DiscreteDistribution(4, [0.4, 0.3, 0.1, 0.2])
c3 = mixture.ProductDistribution([n31, n32, n33, d34])

# creating the model
pi = [0.4, 0.3, 0.3]
m = mixture.MixtureModel(3, pi, [c1, c2, c3])

# sampling of the training data
data = m.sampleDataSet(800)

#---------------------------------------------------
# setting up the five component model we are going to train
# First training component; its discrete feature starts out uniform.
tn11 = mixture.NormalDistribution(1.0, 0.5)
tn12 = mixture.NormalDistribution(2.0, 0.5)
tn13 = mixture.NormalDistribution(-3.0, 0.5)
td14 = mixture.DiscreteDistribution(4, [0.25] * 4)
tc1 = mixture.ProductDistribution([tn11, tn12, tn13, td14])
def find_threshold(self, user_params):
    """Fit a two-component Gaussian mixture to ``self.prob_smoothed``.

    The mean and standard deviation of the component judged to be the
    "main" one are stored on the instance; nothing is returned.

    Args:
        user_params: Mapping read with ``.get("max_gauss_mixtures")``.  The
            value is forwarded as the second positional argument of
            ``mixture_model.randMaxEM``.
            NOTE(review): presumably the number of random EM restarts --
            confirm against the pymix ``randMaxEM`` signature.

    Side effects:
        Sets ``self.thresh_main_mean`` and ``self.thresh_main_std``.
        Prints progress/diagnostic lines to stdout.
        Shuffles via ``numpy.random`` (consumes global RNG state).

    Notes:
        * EM runs on a random subsample of ``floor(data.size / 10)`` points,
          capped at 50000.
        * An ``AssertionError`` from pymix triggers a retry on a fresh
          subsample; the retry count is unbounded.
        * Any other ``Exception`` falls back to a model whose two components
          are both ``Normal(mean(data), std(data))``, so the values stored
          are the plain data mean and standard deviation.
    """
    max_gauss_mixtures = user_params.get("max_gauss_mixtures")
    data = self.prob_smoothed
    data_mean = numpy.mean(data)
    data_std = numpy.std(data)

    def _parse_normal(dist):
        # HACK: no good API access to the model components is used here; the
        # parameters are read back from the "...[mean, std]" text of str(dist).
        # TODO(review): use the distribution's own attributes if pymix
        # exposes them.
        text = str(dist)
        mean = float(text.split('[')[1].split(',')[0])
        std = float(text.split(', ')[1].split(']')[0])
        return mean, std

    # http://www.pymix.org/pymix/index.php?n=PyMix.Tutorial
    # Start from one Gaussian at the data mean carrying almost all the weight
    # and a second one at ten times the mean.
    gaussian_one = mixture.NormalDistribution(data_mean, data_std)
    gaussian_two = mixture.NormalDistribution(10.0 * data_mean, data_std)
    mixture_model = mixture.MixtureModel(2, [0.99, 0.01],
                                         [gaussian_one, gaussian_two])

    EM_tuned = False
    while not EM_tuned:
        # Build mix_data from a random 10% of the original data (max 50000).
        index_array = numpy.arange(data.size)
        numpy.random.shuffle(index_array)
        mix_data = mixture.DataSet()
        data_size = numpy.min((int(numpy.floor(data.size / 10.0)), 50000))
        mix_data.fromArray(data[index_array[:data_size]])
        try:
            mixture_model.randMaxEM(mix_data, max_gauss_mixtures, 40, 0.001,
                                    silent=True)
            EM_tuned = True
        except AssertionError:
            # pymix likes to throw assertion errors when it has small machine
            # precision errors... retry with a new random subsample.
            print("Caught an assertion error in pymix, randomizing input and trying again")
        except Exception:
            # Was a bare `except:` that only rebound a local variable, so the
            # announced fallback never reached the model.  Rebuild the model
            # so both components are the single data-wide Gaussian.
            print("pymix failed to find mixture model, using single gaussian")
            mixture_model = mixture.MixtureModel(
                2, [0.99, 0.01],
                [mixture.NormalDistribution(data_mean, data_std),
                 mixture.NormalDistribution(data_mean, data_std)])
            EM_tuned = True

    gauss_one_mean, gauss_one_std = _parse_normal(mixture_model.components[0][0])
    gauss_two_mean, gauss_two_std = _parse_normal(mixture_model.components[1][0])

    print("Gauss1: mu: %f, std: %f" % (gauss_one_mean, gauss_one_std))
    print("Gauss2: mu: %f, std: %f" % (gauss_two_mean, gauss_two_std))

    # Keep component one unless component two has the larger-or-equal mean
    # AND holds at least 60% of the mixture weight.
    if gauss_one_mean > gauss_two_mean or mixture_model.pi[1] < 0.60:
        self.thresh_main_mean = gauss_one_mean
        self.thresh_main_std = gauss_one_std
    else:
        self.thresh_main_mean = gauss_two_mean
        self.thresh_main_std = gauss_two_std
# iq.txt = iq and achievement test fields from pheno.txt # drd4_len.txt = drd4 vntr types, only number of repeats data.fromFiles(["iq.txt", "phys.txt", "drd4_len.txt"]) COMOR = 11 G = 8 components = [] for i in range(G): # intelligence and achivement tests as univariate normal distributions. (TEST) bd_mu = float(random.randint(3, 16)) bd_sigma = random.uniform(1.0, 8.0) missing_bd = mixture.NormalDistribution(-9999.9, 0.00001) dist_bd = mixture.NormalDistribution(bd_mu, bd_sigma) mix_bd = mixture.MixtureModel(2, [0.999, 0.001], [dist_bd, missing_bd], compFix=[0, 2]) voc_mu = float(random.randint(3, 16)) voc_sigma = random.uniform(1.0, 8.0) missing_voc = mixture.NormalDistribution(-9999.9, 0.00001) dist_voc = mixture.NormalDistribution(voc_mu, voc_sigma) mix_voc = mixture.MixtureModel(2, [0.999, 0.001], [dist_voc, missing_voc], compFix=[0, 2]) read_mu = float(random.randint(80, 120)) read_sigma = random.uniform(1.0, 28.0) missing_read = mixture.NormalDistribution(-9999.9, 0.00001) dist_read = mixture.NormalDistribution(read_mu, read_sigma) mix_read = mixture.MixtureModel(2, [0.999, 0.001], [dist_read, missing_read], compFix=[0, 2])
def _kl_separated(cand, accepted, KL_lower, KL_upper):
    """Return True if `cand` lies within the KL band of every accepted distribution."""
    for d in accepted:
        KL_dist = mixture.sym_kl_dist(d, cand)
        if KL_dist > KL_upper or KL_dist < KL_lower:
            # One violation is enough to reject; no need to test the rest.
            return False
    return True


def _draw_separated(make_candidate, accepted, KL_lower, KL_upper):
    """Rejection-sample candidates until one passes `_kl_separated`."""
    while True:
        cand = make_candidate()
        if _kl_separated(cand, accepted, KL_lower, KL_upper):
            return cand


def getRandomMixture(G, p, KL_lower, KL_upper, dtypes='discgauss', M=4,
                     seed=None):
    """Build a random G-component mixture over p independent features.

    For every feature, G distributions are drawn one after another.  A draw
    is accepted only if its symmetric KL distance (``mixture.sym_kl_dist``)
    to each distribution already accepted for that feature lies in
    ``[KL_lower, KL_upper]``.  Component ``i`` of the result is the
    ``ProductDistribution`` of the ``i``-th distribution of every feature.

    Args:
        G: Number of mixture components.
        p: Number of features per component.
        KL_lower: Smallest accepted symmetric KL distance.
        KL_upper: Largest accepted symmetric KL distance.
        dtypes: ``'disc'`` (all discrete), ``'gauss'`` (all Normal) or
            ``'discgauss'`` (each feature chosen uniformly at random).
        M: Alphabet size for discrete distributions.
        seed: Accepted for interface compatibility but currently ignored.
            The original seeding code (``random.seed`` plus
            ``mixture._C_mixextend.set_gsl_rng_seed``) was commented out.

    Returns:
        A ``mixture.MixtureModel`` built with ``struct=1`` on which
        ``updateFreeParams()`` has been called.  The weights come from
        ``get_random_pi(G, 0.1)``.

    Raises:
        TypeError: If `dtypes` is not one of the three supported values.
        RuntimeError: If an unknown internal feature type is encountered.

    Note:
        Sampling is pure rejection, so the function does not terminate if
        the KL band cannot be satisfied.
    """
    min_sigma = 0.1  # minimal std for Normal
    max_sigma = 1.0  # maximal std for Normal
    min_mu = -5.0  # minimal mean
    max_mu = 8.0  # maximal mean

    # Feature type codes: 0 discrete, 1 Normal.
    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p
    elif dtypes == 'discgauss':
        # discrete or Normal features for now, chosen uniformly
        featureTypes = [random.choice((0, 1)) for i in range(p)]
    else:
        raise TypeError(
            "dtypes must be 'disc', 'gauss' or 'discgauss', got %r" % (dtypes,))

    def make_discrete():
        return mixture.DiscreteDistribution(M, mixture.random_vector(M))

    def make_normal():
        # mu is drawn before sigma, matching the original RNG call order.
        mu = random.uniform(min_mu, max_mu)
        sigma = random.uniform(min_sigma, max_sigma)
        return mixture.NormalDistribution(mu, sigma)

    C = []  # C[j][i]: distribution of feature j in component i
    for j in range(p):
        if featureTypes[j] == 0:
            make_candidate = make_discrete
        elif featureTypes[j] == 1:
            make_candidate = make_normal
        else:
            # The original had a bare `RuntimeError` expression here, which
            # never raised.
            raise RuntimeError("unknown feature type %r" % (featureTypes[j],))

        c_j = []
        for i in range(G):
            c_j.append(_draw_separated(make_candidate, c_j, KL_lower, KL_upper))
        C.append(c_j)

    comps = [mixture.ProductDistribution([C[j][i] for j in range(p)])
             for i in range(G)]

    pi = get_random_pi(G, 0.1)
    m = mixture.MixtureModel(G, pi, comps, struct=1)
    m.updateFreeParams()
    return m
def clustering(k, feature_cols, feature_domains, header, table, seeds,
               result_file):
    """Build a k-component mixture over the table columns and label the rows.

    No EM fitting is performed: the ``randMaxEM`` call of the original
    random-restart loop was commented out, so the freshly initialised model
    is used as-is to classify the data.

    Args:
        k: Number of mixture components; each gets weight ``1.0 / k``.
        feature_cols: For table column ``j``, ``feature_cols[j]`` is the
            column id used to index `header` and `feature_domains`.
        feature_domains: Per column id.  For a ``'cat'`` column, a dict
            whose keys are the category values.  For a ``'num'`` column, a
            dict with ``'min'`` and ``'max'``.
        header: Passed to ``prep.get_col_type``; ``header[col_id][0]`` is
            used as the key into `seeds`.
        table: 2-D array (``table.shape[1]`` columns) loaded into the
            ``mx.DataSet``.
        seeds: ``seeds[name][i]`` is the initial mean of numeric column
            ``name`` in component ``i``.
        result_file: Path prefix; ``<prefix>.stats`` and ``<prefix>.model``
            are written.

    Returns:
        The mixture model (a shallow ``copy.copy`` of the one built here).

    Side effects:
        Prints the model, writes the two files above, and calls
        ``sys.exit(1)`` (after a message on stderr) if a column type is
        neither ``'cat'`` nor ``'num'``.
    """
    data = mx.DataSet()
    data.fromArray(table)

    weights_norm = [1.0 / k] * k
    components = []
    for i in range(k):
        products = []
        for j in range(table.shape[1]):
            col_id = feature_cols[j]
            col_type = prep.get_col_type(col_id, header)
            if col_type == 'cat':
                # Random categorical distribution over the observed values.
                vals = feature_domains[col_id].keys()
                cnt_vals = len(vals)
                rand_dist = np.random.random_sample(cnt_vals)
                dist = mx.DiscreteDistribution(cnt_vals,
                                               rand_dist / sum(rand_dist),
                                               mx.Alphabet(vals))
            elif col_type == 'num':
                # Mean comes from the supplied seed; the spread is half the
                # value range divided by the number of components.
                min_val = feature_domains[col_id]['min']
                max_val = feature_domains[col_id]['max']
                mean = seeds[header[col_id][0]][i]
                stdev = (max_val - min_val) / 2.0 / k
                dist = mx.NormalDistribution(mean, stdev)
            else:
                # Same exit status as before, but no longer silent.
                sys.stderr.write("clustering: unsupported column type %r for "
                                 "column %r\n" % (col_type, col_id))
                sys.exit(1)
            products.append(dist)
        components.append(mx.ProductDistribution(products))

    mix_table = mx.MixtureModel(k, weights_norm, components)
    print(mix_table)

    # NOTE: the original wrapped the construction above in a single-pass
    # "random seeding loop" that compared against an undefined `loglike`; it
    # only worked because `best_loglike` was always None.  With EM scoring
    # disabled, the single model built is by definition the best one.
    best_model = copy.copy(mix_table)

    labels = best_model.classify(data, None, None, 1)

    ## output clustering results
    # count cluster sizes on sampled data
    cnt = {}
    for l in labels:
        cnt[l] = cnt.get(l, 0) + 1
    total = sum(cnt.values())

    # `with` guarantees the stats file is closed even if a write fails.
    with open(result_file + '.stats', 'w') as f:
        for l in cnt:
            f.write('%s %d %f%%\n' % (l, cnt[l], cnt[l] * 100.0 / total))

    mx.writeMixture(best_model, result_file + '.model')
    return best_model