Example #1
def testdtree():

    tree = {}
    tree[0] = -1
    tree[1] = 0
    tree[2] = 1

    n1 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(3, [0, 1, 0], [0, -0.1, 0.1],
                                             [0.5, 0.5, 0.5], tree)
    ])
    tree2 = {}
    tree2[0] = -1
    tree2[1] = 0
    tree2[2] = 0
    n2 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(3, [-1, 0, 1], [0, 0.1, -0.1],
                                             [0.5, 0.5, 0.5], tree2)
    ])

    pi = [0.4, 0.6]
    gen = mixture.MixtureModel(2, pi, [n1, n2])

    random.seed(1)
    data = gen.sampleDataSet(1000)

    print data

    n1 = mixture.ProductDistribution([
        mixture.DependenceTreeDistribution(3, [0.1, 1.1, 0.1], [0, 0, 0],
                                           [1.0, 1.0, 1.0])
    ])
    n2 = mixture.ProductDistribution([
        mixture.DependenceTreeDistribution(3, [-1, 0, -0.1], [0, 0, 0],
                                           [1.0, 1.0, 1.0])
    ])
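    # NOTE: n1 and n2 are rebound to ConditionalGauss components below, so the
    # DependenceTreeDistribution components above never reach the trained model.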

    n1 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(3, [0, 1, 0], [0.0, 0.1, 0.1],
                                             [0.1, 0.1, 0.1], tree)
    ])
    n2 = mixture.ProductDistribution([
        mixture.ConditionalGaussDistribution(3, [-1, 0, 1], [0.0, 0.1, 0.1],
                                             [0.1, 0.1, 0.1], tree2)
    ])

    train = mixture.MixtureModel(2, pi, [n1, n2])
    train.modelInitialization(data)
    train.EM(data, 100, 0.01, silent=1)
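    # A minimal follow-up sketch (not part of the original test): after EM
    # converges, classify() assigns each sample to a component, as in the
    # classify() call of Example #19 below.
    labels = train.classify(data, None, None, 1)
    print labels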
Example #2
def testLymphData():

    k = 5
    d = 11

    aux = [0] * d

    models = []

    for i in range(k):
        aux1 = [0] * d
        aux2 = [0] * d
        aux3 = [0] * d
        models.append(
            mixture.ProductDistribution(
                [mixture.DependenceTreeDistribution(d, aux1, aux2, aux3)]))

    pi = [1.0] * k
    pi = numpy.array(pi) / k

    train = mixture.MixtureModel(k, pi, models)

    data = mixture.DataSet()
    data.fromFiles(['data/ltree2_2fold.txt'])

    train.modelInitialization(data)

    train.EM(data, 100, 0.01, silent=1)
Example #3
def initialize_normal_model(ng, data):
    # norm_multdist() returns `ng` component distributions; building the
    # component list directly avoids the per-ng branching and works for
    # any ng >= 2.
    mod_ps = np.repeat(1.0 / ng, ng)
    components = list(norm_multdist(ng, data))
    mix_ = mixture.MixtureModel(ng, mod_ps, components)
    return mix_
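# Usage sketch, assuming a mixture.DataSet `data` and the norm_multdist()
# helper this function relies on:
#   mix_ = initialize_normal_model(3, data)
#   mix_.EM(data, 40, 0.1, silent=1)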
Example #4
    def setUp(self):
        # building generating models
        self.DIAG = mixture.Alphabet(['.', '0', '8', '1'])

        A = [[0.3, 0.6, 0.1], [0.0, 0.5, 0.5], [0.4, 0.2, 0.4]]
        B = [[0.5, 0.2, 0.1, 0.2], [0.5, 0.4, 0.05, 0.05],
             [0.8, 0.1, 0.05, 0.05]]
        pi = [1.0, 0.0, 0.0]
        self.h1 = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)), A, B, pi)

        A2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
        B2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4], [0.2, 0.2, 0.3, 0.3]]  # each emission row must sum to 1
        pi2 = [0.6, 0.4, 0.0]
        self.h2 = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)), A2, B2, pi2)

        n1 = mixture.NormalDistribution(2.5, 0.5)
        n2 = mixture.NormalDistribution(6.0, 0.8)

        mult1 = mixture.MultinomialDistribution(3,
                                                4, [0.23, 0.26, 0.26, 0.25],
                                                alphabet=self.DIAG)
        mult2 = mixture.MultinomialDistribution(3,
                                                4, [0.7, 0.1, 0.1, 0.1],
                                                alphabet=self.DIAG)

        c1 = mixture.ProductDistribution([n1, mult1, self.h1])
        c2 = mixture.ProductDistribution([n2, mult2, self.h2])

        mpi = [0.4, 0.6]
        self.m = mixture.MixtureModel(2, mpi, [c1, c2])

        # mixture for sampling
        gc1 = mixture.ProductDistribution([n1, mult1])
        gc2 = mixture.ProductDistribution([n2, mult2])
        self.gen = mixture.MixtureModel(2, mpi, [gc1, gc2])
Example #5
    def testem(self):
        # complex DataSet with HMM sequences and scalar data
        dat = self.gen.sampleSet(100)

        # sampling hmm data
        seq1 = self.h1.hmm.sample(40, 10)
        seq2 = self.h2.hmm.sample(60, 10)

        seq1.merge(seq2)

        data = mixtureHMM.SequenceDataSet()
        data.fromGHMM(dat, [seq1])
        data.internalInit(self.m)

        tA = [[0.5, 0.2, 0.3], [0.2, 0.3, 0.5], [0.1, 0.5, 0.4]]
        tB = [[0.2, 0.4, 0.1, 0.3], [0.5, 0.1, 0.2, 0.2],
              [0.4, 0.3, 0.15, 0.15]]
        tpi = [0.3, 0.3, 0.4]
        th1 = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)), tA, tB, tpi)

        tA2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
        tB2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4],
               [0.2, 0.1, 0.6, 0.1]]
        tpi2 = [0.3, 0.4, 0.3]
        th2 = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)), tA2, tB2, tpi2)

        tn1 = mixture.NormalDistribution(-1.5, 1.5)
        tn2 = mixture.NormalDistribution(9.0, 1.2)

        tmult1 = mixture.MultinomialDistribution(3,
                                                 4, [0.1, 0.1, 0.55, 0.25],
                                                 alphabet=self.DIAG)
        tmult2 = mixture.MultinomialDistribution(3,
                                                 4, [0.4, 0.3, 0.1, 0.2],
                                                 alphabet=self.DIAG)

        tc1 = mixture.ProductDistribution([tn1, tmult1, th1])
        tc2 = mixture.ProductDistribution([tn2, tmult2, th2])

        tmpi = [0.7, 0.3]
        tm = mixture.MixtureModel(2, tmpi, [tc1, tc2])

        tm.EM(data, 80, 0.1, silent=1)
Example #6
def plot_fitting_progress(dummy, n_comp, mix_data, inits):
    plot_mix(mix_data.dataMatrix)
    i = n_comp
    data = mix_data
    print '# of components: {}\n'.format(i)
    rand_peaks = inits
    print rand_peaks
    pi = [1. / i] * i
    components = [dummy(p) for p in rand_peaks]
    m = mixture.MixtureModel(i, pi, copy.deepcopy(components))
    print 'Initial: {}\n'.format(m)
    _, llh = m.EM(data, 40, .1)
    print 'Final: {}\n'.format(m)
    for j in range(1, 41):
        print 'Iter {}\n=======\n'.format(j)
        m = mixture.MixtureModel(i, pi, copy.deepcopy(components))
        print 'Before:\n{}'.format(m)
        _, llh = m.EM(data, 1, .1)
        components = m.components
        pi = m.pi
        print 'After:\n{}'.format(m)
        # use a separate name so the model `m` is not shadowed by the loop
        for comp in m.components:
            plot_N(*get_moments(comp), col=(40 - j) / 60.)
    plt.show()
Example #7
def getModel(G, p):
    """
    Constructs a PWM MixtureModel.
    
    @param G: number of components
    @param p: number of positions of the binding site
    @return: MixtureModel object
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
    comps = []
    for i in range(G):
        dlist = []
        for j in range(p):
            phi = mixture.random_vector(4)
            dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
        comps.append(mixture.ProductDistribution(dlist))
    pi = mixture.random_vector(G)
    m = mixture.MixtureModel(G, pi, comps)
    return m
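# Usage sketch (hypothetical input; `sites` would be a list of length-p
# symbol lists over the DNA alphabet):
#   m = getModel(2, 8)
#   data = mixture.DataSet()
#   data.fromList(sites)
#   data.internalInit(m)
#   m.EM(data, 40, 0.1)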
Example #8
    def testsimpleem(self):

        # sampling hmm data
        seq1 = self.h1.hmm.sample(40, 10)
        seq2 = self.h2.hmm.sample(60, 10)
        seq1.merge(seq2)

        data = mixtureHMM.SequenceDataSet()
        data.fromGHMM([], [seq1])

        tA = [[0.5, 0.2, 0.3], [0.2, 0.3, 0.5], [0.1, 0.5, 0.4]]
        tB = [[0.2, 0.4, 0.1, 0.3], [0.5, 0.1, 0.2, 0.2],
              [0.4, 0.3, 0.15, 0.15]]
        tpi = [0.3, 0.3, 0.4]
        th1 = mixture.ProductDistribution([
            mixtureHMM.getHMM(
                mixtureHMM.ghmm.IntegerRange(0, 4),
                mixtureHMM.ghmm.DiscreteDistribution(
                    mixtureHMM.ghmm.IntegerRange(0, 4)), tA, tB, tpi)
        ])

        tA2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
        tB2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4],
               [0.2, 0.1, 0.6, 0.1]]
        tpi2 = [0.3, 0.4, 0.3]
        th2 = mixture.ProductDistribution([
            mixtureHMM.getHMM(
                mixtureHMM.ghmm.IntegerRange(0, 4),
                mixtureHMM.ghmm.DiscreteDistribution(
                    mixtureHMM.ghmm.IntegerRange(0, 4)), tA2, tB2, tpi2)
        ])

        mpi = [0.4, 0.6]
        hm = mixture.MixtureModel(2, mpi, [th1, th2])

        data.internalInit(hm)

        hm.EM(data, 80, 0.1, silent=1)
Example #9
def getBackgroundModel(p, dist=None):
    """
    Construct background model
    
    @param p: number of positions of the binding site
    @param dist: background nucleotide frequencies, uniform is default
    
    @return: MixtureModel representing the background
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
    dlist = []

    if dist is None:
        phi = [0.25] * 4
    else:
        phi = dist

    for j in range(p):
        dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
    comps = [mixture.ProductDistribution(dlist)]

    m = mixture.MixtureModel(1, [1.0], comps)
    return m
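# Usage sketch: a non-uniform background, e.g. GC-rich frequencies
# (illustrative values only):
#   bg = getBackgroundModel(8, dist=[0.2, 0.3, 0.3, 0.2])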
Example #10
    def testinternalinitcomplexempty(self):
        # complex DataSet with HMM sequences only

        # sampling hmm data
        seq1 = self.h1.hmm.sample(40, 10)
        seq2 = self.h2.hmm.sample(60, 10)
        seq1.merge(seq2)

        data = mixtureHMM.SequenceDataSet()
        data.fromGHMM([], [seq1])

        self.assertRaises(AssertionError, data.internalInit, self.m)

        c1 = mixture.ProductDistribution([self.h1])
        c2 = mixture.ProductDistribution([self.h2])

        mpi = [0.4, 0.6]
        hm = mixture.MixtureModel(2, mpi, [c1, c2])

        data.internalInit(hm)

        self.assertEqual(str(data.complexFeature), '[1]')
        self.assertEqual(data.p, 1)
        self.assertEqual(data.suff_p, 1)
Example #11
def gaussian_decomposition(dist, max_comp=5, max_trials=10, max_steps=40):
    """ Performs Gaussian decomposition.

    Decompose the input distribution into best-fit Gaussian components.

    Args:
        dist: input distribution in 'mcerp' format.
        max_comp: maximum number of Gaussian components.
        max_trials: maximum number of trials before increasing the number of components. Larger is better/slower.
        max_steps: maximum steps in each fitting process. Larger is better/slower.

    Returns:
        A list of (pi, mu, sigma) tuples for the Gaussian components.
    """

    #mix = np.concatenate([np.random.normal(0, 1, [2000]), np.random.normal(6, 2, [4000]), np.random.normal(-3, 1.5, [1000])])
    mix = dist._mcpts

    data = mixture.DataSet()
    data.fromArray(mix)

    # TODO: what to set for init std? Sweep? Or some desired value for later analytical solving?
    std = 1.

    dummy = functools.partial(mixture.NormalDistribution, sigma=std)

    best_llh, best_peaks, best_mixture = None, None, None

    for i in range(1, 1 + max_comp):
        logging.debug('Gaussian Decomposition Iter: {}\n'.format(i))
        local_llh, local_peaks, local_mixture = None, None, None

        # Try max_trials random initializations.
        for j in range(1, 1 + max_trials):
            pi = [1. / i] * i
            rand_peaks = np.random.choice(mix, i)
            components = [dummy(p) for p in rand_peaks]
            m = mixture.MixtureModel(i, pi, components)
            # Fixed convergence criteria here.
            _, llh = m.EM(data, max_steps, .1, True)
            if local_llh is None or llh > local_llh:
                local_llh = llh
                local_peaks = rand_peaks
                local_mixture = m

        if best_llh is None or local_llh > best_llh:
            best_llh = local_llh
            best_peaks = local_peaks
            best_mixture = local_mixture

    logging.debug('BEST MIXTURE ({}):\n{}'.format(best_llh, best_mixture))
    # Plot the progress of fitting, cool figure awaits!
    #plot_fitting_progress(dummy, len(best_mixture.components), data, best_peaks)

    result = []
    for (comp, pi) in zip(best_mixture.components, best_mixture.pi):
        # comp is a ProductDistribution instance, which may itself contain
        # multiple components (though not in this case).
        assert (len(comp.distList) == 1)
        for d in comp:
            result.append((pi, d.mu, d.sigma))
    return result
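# Usage sketch, assuming `dist` is an mcerp-style object exposing its Monte
# Carlo samples via _mcpts (as the docstring above requires):
#   for pi, mu, sigma in gaussian_decomposition(dist, max_comp=3):
#       print pi, mu, sigma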
Example #12
            ##Peaks:  histmaxes
            ##Peak heights:  hist[histmaxes[n]]
            ##Stdev:  stdev

            emdata = mixture.DataSet()
            emdata.fromList(data[label][call])
            numpeaks = len(histmaxes)
            gaussian_objects = []
            weights = []
            for i in xrange(numpeaks):
                n = mixture.NormalDistribution(histmaxes[i], stdev)
                gaussian_objects.append(n)
                weights.append(hist[histmaxes[i]])
            totweight = float(sum(weights))
            weights = [x / totweight for x in weights]
            mymix = mixture.MixtureModel(numpeaks, weights, gaussian_objects)
            # print "Before",mymix
            mymix.EM(emdata, 40, 0.1)
            # print "After",mymix
            print("Number of peaks=", mymix.G)
            for i in range(mymix.G):
                print(mymix.pi[i], mymix.components[i])

            summary.write(patient)
            summary.write("\t" + sample)
            summary.write("\t" + str(len(data[label][call])))
            summary.write("\t" + str(call))
            summary.write("\t" + label)
            #            summary.write("\t" + str(mean*2))
            #            summary.write("\t" + str(stdev*2))
            for i in range(mymix.G):
Example #13
def main():
    logger.debug('App started')

    parser = argparse.ArgumentParser(description='Key processing tool')
    parser.add_argument('-t',
                        '--threads',
                        dest='threads',
                        type=int,
                        default=None,
                        help='Number of threads to use for cert download')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_const',
                        const=True,
                        help='enables debug mode')
    parser.add_argument('--verbose',
                        dest='verbose',
                        action='store_const',
                        const=True,
                        help='enables verbose mode')

    parser.add_argument('--dump-json',
                        dest='dump_json',
                        action='store_const',
                        const=True,
                        help='dumps JSON of the filtered certificates')
    parser.add_argument('--dump-cert',
                        dest='dump_cert',
                        action='store_const',
                        const=True,
                        help='dumps PEM of the filtered certificates')

    parser.add_argument(
        '-f',
        '--filter-org',
        dest='filter_org',
        help='Filter out certificates issued with given organization - regex')
    parser.add_argument(
        '--filter-domain',
        dest='filter_domain',
        help='Filter out certificates issued for the given domain - regex')

    parser.add_argument('--pubs',
                        dest='pubs',
                        nargs=argparse.ZERO_OR_MORE,
                        help='File with public keys (PEM)')

    parser.add_argument('--certs',
                        dest='certs',
                        nargs=argparse.ZERO_OR_MORE,
                        help='File with certificates (PEM)')

    parser.add_argument('--ossl',
                        dest='ossl',
                        type=int,
                        default=None,
                        help='OpenSSL generator')

    parser.add_argument('--per-key-stat',
                        dest='per_key_stat',
                        action='store_const',
                        const=True,
                        help='Print prob matching for each key')

    parser.add_argument('--subs',
                        dest='subs',
                        action='store_const',
                        const=True,
                        help='Plot random subgroups charts')
    parser.add_argument('--subs-k',
                        dest='subs_k',
                        type=int,
                        default=5,
                        help='Size of the subset')
    parser.add_argument('--subs-n',
                        dest='subs_n',
                        type=int,
                        default=1000,
                        help='Number of subsets to sample')

    parser.add_argument('--pca-src',
                        dest='pca_src',
                        action='store_const',
                        const=True,
                        help='Plot PCA sampled distribution vs collected one')
    parser.add_argument(
        '--pca-src-n',
        dest='pca_src_n',
        type=int,
        default=10000,
        help='Number of subsets to sample from source distributions')
    parser.add_argument('--pca-src-k',
                        dest='pca_src_k',
                        type=int,
                        default=3,
                        help='Size of the subset from the source distribution')

    parser.add_argument('--pca-grp',
                        dest='pca_grp',
                        action='store_const',
                        const=True,
                        help='Plot PCA on the input keys (groups)')

    parser.add_argument('--mixture',
                        dest='mixture',
                        action='store_const',
                        const=True,
                        help='Mixture distribution on masks - sources')

    parser.add_argument('--distrib',
                        dest='distrib',
                        action='store_const',
                        const=True,
                        help='Plot distributions - to the PDF')

    parser.add_argument('--distrib-mix',
                        dest='distribmix',
                        action='store_const',
                        const=True,
                        help='Plot distributions groups mixed with sources')

    parser.add_argument('--key-dist',
                        dest='plot_key_dist',
                        action='store_const',
                        const=True,
                        help='Plots key mask distribution')

    parser.add_argument('files',
                        nargs=argparse.ZERO_OR_MORE,
                        default=[],
                        help='file with ssl-dump json output')

    args = parser.parse_args()

    last_src_id = 0
    src_names = []
    masks_db = []
    masks_src = []
    cert_db = []
    keys_db = []

    # Input = ssl-dump output
    if len(args.files) > 0:
        # Cert Organization Filtering
        re_org = None if args.filter_org is None else re.compile(
            args.filter_org, re.IGNORECASE)
        # Domain filtering
        re_dom = None if args.filter_domain is None else re.compile(
            args.filter_domain, re.IGNORECASE)

        # Process files
        for fl in args.files:
            with open(fl, mode='r') as fh:
                data = fh.read()

                # Parse json out
                if '-----BEGIN JSON-----' in data:
                    if '-----END JSON-----' not in data:
                        raise ValueError('BEGIN JSON present but END JSON not')
                    match = re.search(
                        r'-----BEGIN JSON-----(.+?)-----END JSON-----', data,
                        re.MULTILINE | re.DOTALL)
                    if match is None:
                        raise ValueError('Could not extract JSON')
                    data = match.group(1)

                json_data = json.loads(data)
                for cert in json_data:
                    org = cert['org']
                    if org is None:
                        org = ''
                    if re_org is not None and re_org.match(org) is None:
                        if args.verbose:
                            print('Organization filtered out %s' % org)
                        continue
                    if re_dom is not None:
                        dom_match = re_dom.match(cert['cn']) is not None
                        for alt in cert['alts']:
                            dom_match |= re_dom.match(alt) is not None
                        if not dom_match:
                            if args.verbose:
                                print('Domain filtered out %s' % cert['cn'])
                            continue

                    cert_db.append(cert)
                    masks_db.append(cert['pubkey']['mask'])
                    masks_src.append(last_src_id)
            src_names.append(fl)
            last_src_id += 1

        if args.verbose:
            print('Certificate database size %d' % len(cert_db))

        if args.dump_json:
            print(json.dumps(cert_db))

        if args.dump_cert:
            for cert in cert_db:
                print(cert['cert'])

    # public key list processing
    if args.pubs is not None:
        for pubf in args.pubs:
            with open(pubf, mode='r') as fh:
                data = fh.read()
                keys = []
                for match in re.finditer(
                        r'-----BEGIN PUBLIC KEY-----(.+?)-----END PUBLIC KEY-----',
                        data, re.MULTILINE | re.DOTALL):
                    key = match.group(0)
                    keys.append(key)
                print('File %s keys num: %d' % (pubf, len(keys)))

                # pubkey -> mask
                for key in keys:
                    pub = serialization.load_pem_public_key(
                        key, utils.get_backend())
                    mask = keys_basic.compute_key_mask(pub.public_numbers().n)
                    keys_db.append(pub)
                    masks_db.append(mask)
                    masks_src.append(last_src_id)
            src_names.append(pubf)
            last_src_id += 1

    # extract public key from certificate
    if args.certs is not None:
        for certf in args.certs:
            with open(certf, mode='r') as fh:
                data = fh.read()
                certs = []
                for match in re.finditer(
                        r'-----BEGIN CERTIFICATE-----(.+?)-----END CERTIFICATE-----',
                        data, re.MULTILINE | re.DOTALL):
                    cert = match.group(0)
                    certs.append(cert)

                # cert -> mask
                for cert in certs:
                    x509 = utils.load_x509(str(cert))
                    pub = x509.public_key()
                    mask = keys_basic.compute_key_mask(pub.public_numbers().n)
                    keys_db.append(pub)
                    masks_db.append(mask)
                    masks_src.append(last_src_id)
            src_names.append(certf)
            last_src_id += 1

    # generate openssl keys on the fly
    if args.ossl is not None:
        for i in range(0, args.ossl):
            print('Generating RSA1024 key %03d' % i)
            key = OpenSSL.crypto.PKey()
            key.generate_key(OpenSSL.crypto.TYPE_RSA, 1024)
            key_pem = OpenSSL.crypto.dump_privatekey(
                OpenSSL.crypto.FILETYPE_PEM, key)

            priv = serialization.load_pem_private_key(key_pem, None,
                                                      utils.get_backend())
            mask = keys_basic.compute_key_mask(
                priv.public_key().public_numbers().n)
            keys_db.append(priv.public_key())
            masks_db.append(mask)
            masks_src.append(last_src_id)
        src_names.append('ossl-%d' % args.ossl)
        last_src_id += 1

    # Load statistics
    st = key_stats.KeyStats()
    st.load_tables()
    if args.verbose:
        print('Source stats: ')
        for src in st.sources_cn:
            print(' %30s: %08d' % (src, st.sources_cn[src]))
        print('Group stats:')
        for grp in st.groups:
            print(' %30s: %02d' % (grp, st.get_group_size(grp)))

    # mask indices
    mask_map, mask_max, mask_map_x, mask_map_y, mask_map_last_x, mask_map_last_y = keys_basic.generate_pubkey_mask_indices(
    )
    print('Max mask 1D config: [%d]' % mask_max)
    print('Max mask 2D config: [%d, %d]' % (mask_map_last_x, mask_map_last_y))

    # masks processing part
    if len(masks_db) == 0:
        return

    # Simple match
    if args.per_key_stat:
        print('Per-key matching: ')
        for idx, mask in enumerate(masks_db):
            print('Key %02d, mask: %s' % (idx, mask))

            res = []
            for src in st.table_prob:
                val = st.table_prob[src][mask]
                res.append((src, val if val is not None else 0))
            print_res(res, st)

    # Total key matching
    use_loglikelihood = True
    print('Fit for all keys in one distribution:')
    total_weights = src_total_match = comp_total_match_dict(
        masks_db, st, loglikelihood=use_loglikelihood)
    res = key_val_to_list(src_total_match)
    print_res(res, st, loglikelihood=use_loglikelihood)
    res = st.res_src_to_group(res)
    # bar_chart(res=res, title='Fit for all keys')

    # Avg + mean
    print('Avg + mean:')
    src_total_match = {}  # source -> [p1, p2, p3, p4, ..., p_keynum]
    for src in st.table_prob:
        src_total_match[src] = []
        for idx, mask in enumerate(masks_db):
            val = keys_basic.aggregate_mask(st.sources_masks_prob[src], mask)
            if use_loglikelihood:
                if total_weights[src] is not None:
                    src_total_match[src].append(val + total_weights[src])
                else:
                    src_total_match[src].append(-9999.9)
            else:
                src_total_match[src].append(val * total_weights[src])
            pass
        pass
    res = []
    devs = []
    for src in st.sources:
        m = np.mean(src_total_match[src])
        s = np.std(src_total_match[src])
        res.append((src, m))
        devs.append(s)

    # Total output
    print_res(res, st, error=devs, loglikelihood=use_loglikelihood)
    # bar_chart(res=res, error=devs, title='Avg for all keys + error')

    # PCA on the keys - groups
    keys_grp_vec = []
    for idx, mask in enumerate(masks_db):
        keys_grp_vec.append([])
        for src in st.groups:
            keys_grp_vec[idx].append(0)
        for idxs, src in enumerate(st.sources):
            grp = st.src_to_group(src)
            prob = st.table_prob[src][mask]
            keys_grp_vec[idx][st.get_group_idx(grp)] += prob

    if args.pca_grp:
        X = np.array(keys_grp_vec)
        pca = PCA(n_components=2)
        pca.fit(X)
        X_transformed = pca.transform(X)
        print('PCA mean: %s, components: ' % pca.mean_)
        print(pca.components_)

        masks_src_np = np.array(masks_src)
        plt.rcdefaults()
        colors = matplotlib.cm.rainbow(np.linspace(0, 1, last_src_id))
        for src_id in range(0, last_src_id):
            plt.scatter(X_transformed[masks_src_np == src_id, 0],
                        X_transformed[masks_src_np == src_id, 1],
                        label=src_names[src_id],
                        color=colors[src_id],
                        alpha=0.25,
                        marker=',')
        plt.legend(loc="best", shadow=False, scatterpoints=1)
        plt.show()

    # Random subset
    if args.subs:
        masks_db_tup = []
        for idx, mask in enumerate(masks_db):
            masks_db_tup.append((idx, mask, masks_src[idx]))

        # Many random subsets, top groups
        subs_size = args.subs_k
        subs_count = args.subs_n
        groups_cnt = {}
        subs_data = []
        subs_data_mark = []
        dsrc_num = last_src_id + 1

        # Take subs_count samples from the input masks_db, evaluate them, prepare for PCA
        for i in range(0, subs_count):
            masks = random_subset(masks_db_tup, subs_size)
            src_total_match = comp_total_match_dict([x[1] for x in masks], st)
            res = key_val_to_list(src_total_match)

            total = 0.0
            for tup in res:
                total += tup[1]

            # data vectors for PCA
            tmp_data = []
            for idx, tmp_src in enumerate(st.sources):
                val = src_total_match[tmp_src]
                val = long(math.floor(val * (1000.0 / total)))
                tmp_data.append(val)

            # PCA on sources (tmp_data is indexed by st.sources).
            # For PCA on groups, use the commented res_grp_val lines below.
            subs_data.append(tmp_data)
            # res_grp_val = st.res_src_to_group(zip(st.sources, tmp_data))
            # subs_data.append([x[1] for x in res_grp_val])

            subs_dsources = {}
            max_dsrc = (0, 0)
            for dsrc in [x[2] for x in masks]:
                if dsrc not in subs_dsources:
                    subs_dsources[dsrc] = 0
                subs_dsources[dsrc] += 1

            for dsrc in subs_dsources:
                if subs_dsources[dsrc] > max_dsrc[1]:
                    max_dsrc = (dsrc, subs_dsources[dsrc])
            tmp_mark = max_dsrc[0]

            if max_dsrc[1] == subs_size:
                tmp_mark = max_dsrc[0]
            else:
                tmp_mark = last_src_id

            subs_data_mark.append(tmp_mark)

            for tup in res:
                src = tup[0]
                score = long(math.floor(tup[1] * (1000.0 / total)))
                if score == 0:
                    continue

                grp = st.src_to_group(src)
                if grp not in groups_cnt:
                    groups_cnt[grp] = score
                else:
                    groups_cnt[grp] += score

                if src not in groups_cnt:
                    groups_cnt[src] = score
                else:
                    groups_cnt[src] += score

            # Equalize group sizes
            for grp in st.groups:
                grp = grp.lower()
                if grp in groups_cnt:
                    groups_cnt[grp] /= float(st.get_group_size(grp))

            # best group only
            # best_src = res[0][0]
            # best_grp = st.src_to_group(best_src)
            # if best_grp not in groups_cnt:
            #     groups_cnt[best_grp] = 1
            # else:
            #     groups_cnt[best_grp] += 1

        print('Combinations: (N, k)=(%d, %d) = %d' %
              (subs_count, subs_size, scipy.misc.comb(subs_count, subs_size)))

        sources = st.groups
        values = []
        for source in sources:
            val = groups_cnt[source] if source in groups_cnt else 0
            values.append(val)
        bar_chart(sources,
                  values,
                  xlabel='# of occurrences as top group (best fit)',
                  title='Groups vs. %d random %d-subsets' %
                  (subs_count, subs_size))

        # PCA stuff
        X = np.array(subs_data)
        pca = PCA(n_components=2)
        pU, pS, pV = pca._fit(X)
        X_transformed = pca.transform(X)
        subs_data_mark_pca = np.array(subs_data_mark)

        print('Sources: ')
        print(st.sources)

        print('PCA input data shape %d x %d' %
              (len(subs_data), len(subs_data[0])))
        print('PCA mean: \n%s \nPCA components: \n' % pca.mean_)
        print(pca.components_)

        print('PCA components x: ')
        for x in pca.components_[0]:
            print(x)
        print('\nPCA components y: ')
        for y in pca.components_[1]:
            print(y)

        # print('\nPCA U,S,V')
        # print(pU)
        # print(pS)
        # print(pV)

        colors = ['blue', 'red', 'green', 'gray', 'yellow']

        plt.rcdefaults()
        for src_id in range(0, dsrc_num):
            plt.scatter(X_transformed[subs_data_mark_pca == src_id, 0],
                        X_transformed[subs_data_mark_pca == src_id, 1],
                        color=colors[src_id],
                        alpha=0.5 if src_id < dsrc_num - 1 else 0.2)
        plt.legend(loc="best", shadow=False, scatterpoints=1)

        # plt.scatter([x[0] for x in X_transformed],
        #             [x[1] for x in X_transformed],
        #             alpha=0.5)

        plt.show()

        # PCA against defined sources with known distributions?
        # Creates "background distribution" we want to match to
        if args.pca_src:
            # Four axes, returned as a 2-d array
            plt.rcdefaults()
            #f, axarr = plt.subplots(len(st.sources), 1)
            src_k = args.pca_src_k
            src_n = args.pca_src_n

            # prepare PDF
            ppdf = PdfPages('test.pdf')  # TODO: derive filename from the input set
            sources_to_test = st.sources[20:25] + [
                x for x in st.sources if 'micro' in x.lower()
            ]

            # compute for each source
            # NOTE: aliases, not copies -- draws appended below accumulate in
            # subs_data/subs_data_mark across sources, so each later plot also
            # shows the previous sources' sampled points.
            src_mark_idx = len(subs_data_mark)
            subs_data_src = subs_data
            subs_data_mark_src = subs_data_mark
            for src_idx, source in enumerate(sources_to_test):
                # cur_plot = axarr[src_idx]
                cur_plot = plt

                print('Plotting PCA source %s %d/%d' %
                      (source, src_idx + 1, len(sources_to_test)))

                # Extend subs_data_src with draws from the source distribution
                for i in range(0, src_n):
                    masks = []
                    for tmpk in range(0, src_k):
                        masks.append(st.sample_source_distrib(source))
                    src_total_match = comp_total_match_dict(masks, st)
                    res = key_val_to_list(src_total_match)

                    total = 0.0
                    for tup in res:
                        total += tup[1]

                    # data vectors for PCA
                    tmp_data = []
                    for idx, tmp_src in enumerate(st.sources):
                        val = src_total_match[tmp_src]
                        val = long(math.floor(val * (1000.0 / total)))
                        tmp_data.append(val)

                    # PCA on sources (see the aliasing note above).
                    subs_data_src.append(tmp_data)
                    subs_data_mark_src.append(src_mark_idx)

                # PCA stuff
                X = np.array(subs_data_src)
                pca = PCA(n_components=2)
                pU, pS, pV = pca._fit(X)
                X_transformed = pca.transform(X)
                subs_data_mark_pca = np.array(subs_data_mark_src)

                colors = ['blue', 'red', 'green', 'gray', 'yellow']

                # plot input sources
                for src_id in range(0, dsrc_num):
                    cur_plot.scatter(
                        X_transformed[subs_data_mark_pca == src_id, 0],
                        X_transformed[subs_data_mark_pca == src_id, 1],
                        color=colors[src_id],
                        alpha=0.5 if src_id < dsrc_num - 1 else 0.2)

                # plot the source stuff
                cur_plot.scatter(
                    X_transformed[subs_data_mark_pca == src_mark_idx, 0],
                    X_transformed[subs_data_mark_pca == src_mark_idx, 1],
                    color='gray',
                    marker='+',
                    alpha=0.05)

                cur_plot.legend(loc="best", shadow=False, scatterpoints=1)
                cur_plot.title('Src [%s] input: %s' % (source,
                                                       (', '.join(src_names))))

                cur_plot.savefig(ppdf, format='pdf')
                cur_plot.clf()

            print('Finalizing PDF...')
            # plt.savefig(ppdf, format='pdf')
            ppdf.close()
            pass

    if args.distrib:
        # Plotting distributions for groups, to the PDF
        plt.rcdefaults()
        ppdf = PdfPages('groups_distrib.pdf')

        # Compute for each source
        range_ = st.masks
        range_idx = np.arange(len(st.masks))
        for grp_idx, grp in enumerate(st.groups):
            cur_data = st.groups_masks_prob[grp]
            raw_data = [cur_data[x] for x in st.masks]
            cur_plot = plt

            logger.debug('Plotting distribution %02d/%02d : %s ' %
                         (grp_idx + 1, len(st.groups), grp))
            axes = cur_plot.gca()
            axes.set_xlim([0, len(st.masks)])
            cur_plot.bar(range_idx, raw_data, linewidth=0, width=0.4)
            cur_plot.title('%s (%s)' % (grp, get_group_desc(grp, st)))
            cur_plot.savefig(ppdf, format='pdf')
            cur_plot.clf()

        # Print input data - per source
        max_src = max(masks_src)
        bars = []
        for src_id in range(max_src + 1):
            axes = plt.gca()
            axes.set_xlim([0, len(st.masks)])

            map_data = {}
            for mask in st.masks:
                map_data[mask] = 0.0
            for mask_idx, mask in enumerate(masks_db):
                if masks_src[mask_idx] == src_id:
                    map_data[mask] += 1

            raw_data = []
            for mask in st.masks:
                raw_data.append(map_data[mask])

            b1 = plt.bar(range_idx, raw_data, linewidth=0, width=0.4)
            bars.append(b1)

            plt.title('Source %d' % src_id)
            plt.savefig(ppdf, format='pdf')
            plt.clf()

        # Group distribution + source:
        if args.distribmix:
            width = 0.25
            range_idx = np.arange(len(st.masks))

            # One source to the graph
            max_src = max(masks_src)
            cur_plot = plt
            for src_id in range(max_src + 1):

                bars = []
                logger.debug('Plotting mix distribution src %d ' % src_id)

                map_data = {}
                for mask in st.masks:
                    map_data[mask] = 0.0
                for mask_idx, mask in enumerate(masks_db):
                    if masks_src[mask_idx] == src_id:
                        map_data[mask] += 1

                raw_data = []
                for mask in st.masks:
                    raw_data.append(map_data[mask])
                raw_data = np.array(raw_data)
                raw_data /= float(sum(raw_data))

                for grp_idx, grp in enumerate(st.groups):
                    logger.debug(
                        ' - Plotting mix distribution %02d/%02d : %s ' %
                        (grp_idx + 1, len(st.groups), grp))

                    # Source
                    fig, ax = plt.subplots()
                    b1 = ax.bar(range_idx + width,
                                raw_data,
                                linewidth=0,
                                width=width,
                                color='r')
                    bars.append(b1)

                    # Group
                    cur_data2 = st.groups_masks_prob[grp]
                    raw_data2 = [cur_data2[x] for x in st.masks]

                    bar1 = ax.bar(range_idx,
                                  raw_data2,
                                  linewidth=0,
                                  width=width,
                                  color='b')
                    bars.append(bar1)

                    ax.legend(tuple([x[0] for x in bars]),
                              tuple(['Src %d' % src_id, grp]))
                    ax.set_xlim([0, len(st.masks)])

                    cur_plot.title('%s + source %d' % (grp, src_id))
                    cur_plot.savefig(ppdf, format='pdf')
                    cur_plot.clf()

        logger.info('Finishing PDF')
        ppdf.close()
        pass

    if args.mixture:
        # http://www.pymix.org/pymix/index.php?n=PyMix.Tutorial#bayesmix
        # 1. Create the mixture model: one discrete distribution per source
        dists = []
        alphabet = mixture.Alphabet(st.masks)
        taken_src = []

        for src in st.sources:
            if src not in ('openssl 1.0.2g', 'microsoft .net'):
                continue
            print(' - Source: %s' % src)

            taken_src.append(src)
            probs = []
            for m in st.masks:
                probs.append(st.sources_masks_prob[src][m])

            d = mixture.DiscreteDistribution(len(alphabet),
                                             probs,
                                             alphabet=alphabet)
            dists.append(d)

        # 2. Create the model, for now, with even distribution among components.
        comp_weights = [1.0 / len(dists)] * len(dists)
        mmodel = mixture.MixtureModel(len(dists), comp_weights, dists)
        print('-' * 80)
        print(mmodel)
        print('-' * 80)

        # dump mixtures to the file
        mixture.writeMixture(mmodel, 'src.mix')

        # 3. Input data - array of input masks
        masks_data = [[x] for x in masks_db]
        data = mixture.DataSet()
        data.fromList(masks_data)
        data.internalInit(mmodel)

        print(masks_data)
        print(data)
        print('---------')

        # 4. Compute EM
        # 4. Compute EM. If a source distribution has zero matching inputs an
        # exception is thrown; such sources should be discarded from the input.
        print(mmodel.modelInitialization(data, 1))
        print('EM start: ')

        ress = []
        for r in range(10):
            mmodel.modelInitialization(data, 1)
            emres = mmodel.EM(data, 1000, 1e-17)
            ress.append(emres)
        emres = max(ress, key=lambda x: x[1])

        # print mmodel.randMaxEM(data, 10, 40, 0.1)
        print(emres)

        # Plot
        plt.rcdefaults()
        # plt.plot(range(0, len(emres[0][3])), [2.71828**x for x in emres[0][3]], 'o')
        # plt.plot(range(0, len(emres[0][3])), emres[0][3], 'k')
        # plt.show()

        for i in range(0, 5):
            print('-------')
            for idx, src in enumerate(emres[0]):
                print('- i:%02d src: %02d, val: %s' % (i, idx, src[i]))

        colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(taken_src)))
        range_ = range(0, len(emres[0][0]))
        bars = []
        for idx, src in enumerate(emres[0]):
            b1 = plt.bar(range_, [2.71828**x for x in src], color=colors[idx])
            bars.append(b1)

        plt.legend(tuple(bars), tuple(taken_src))
        plt.grid(True)
        plt.show()

        # for src in emres[0]:
        #     plt.plot(range(0, len(src)), [2.71828**x for x in src], 'o')
        #     # plt.grid(True)
        #     # plt.show()
        #
        # # plt.scatter(mask_map_last_x, mask_map_last_y, c='red', s=scale, alpha=0.3)
        # # plt.legend()
        # plt.grid(True)
        # plt.show()

    # Chisquare
    for source in st.sources_masks:
        cn = st.sources_cn[source]
        # chi = chisquare()
        # gen = keys_basic.generate_pubkey_mask()

    # 2D Key plot
    if args.plot_key_dist:
        plot_key_mask_dist(masks_db, st)
Example #14
n1 = mixture.NormalDistribution(2.5, 0.5)
n2 = mixture.NormalDistribution(6.0, 0.8)

mult1 = mixture.MultinomialDistribution(3,
                                        4, [0.23, 0.26, 0.26, 0.25],
                                        alphabet=DIAG)
mult2 = mixture.MultinomialDistribution(3,
                                        4, [0.7, 0.1, 0.1, 0.1],
                                        alphabet=DIAG)

c1 = mixture.ProductDistribution([n1, mult1, h1])
c2 = mixture.ProductDistribution([n2, mult2, h2])

mpi = [0.4, 0.6]
m = mixture.MixtureModel(2, mpi, [c1, c2])

#print m
#print "-->",m.components[0].suff_dataRange

# ----------- constructing complex DataSet ----------------

# mixture for sampling
gc1 = mixture.ProductDistribution([n1, mult1])
gc2 = mixture.ProductDistribution([n2, mult2])
gen = mixture.MixtureModel(2, mpi, [gc1, gc2])

dat = gen.sampleSet(100)
#print dat

# sampling hmm data
Example #15
n22 = mixture.NormalDistribution(-6.0, 0.5)
n23 = mixture.NormalDistribution(3.0, 0.7)
d24 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.4, 0.4])

c2 = mixture.ProductDistribution([n21, n22, n23, d24])

n31 = mixture.NormalDistribution(2.0, 0.5)
n32 = mixture.NormalDistribution(-3.0, 0.5)
n33 = mixture.NormalDistribution(3.0, 0.7)
d34 = mixture.DiscreteDistribution(4, [0.4, 0.3, 0.1, 0.2])

c3 = mixture.ProductDistribution([n31, n32, n33, d34])

# creating the model
pi = [0.4, 0.3, 0.3]
m = mixture.MixtureModel(3, pi, [c1, c2, c3])

# sampling of the training data
data = m.sampleDataSet(800)

#---------------------------------------------------

# setting up the five component model we are going to train

tn11 = mixture.NormalDistribution(1.0, 0.5)
tn12 = mixture.NormalDistribution(2.0, 0.5)
tn13 = mixture.NormalDistribution(-3.0, 0.5)
td14 = mixture.DiscreteDistribution(4, [0.25] * 4)

tc1 = mixture.ProductDistribution([tn11, tn12, tn13, td14])
Example #16
    def find_threshold(self, user_params):
        """Finds the thresholds for errors given the data using Gaussian Mixture Model

        Args:
            data: The data to fit

        Kwargs:
            method: Whether to us [min,median,mean] of data in each bin
            thresh: Threshold for find_alpha
            bins: Number of pieces of the data we look at
            plot: Whether to plot the cdf and the two alpha cutoffs

        Returns:
            A soft threshold (alpha0) and A strong threshold (alpha1)

        Raises:
            
        """

        max_gauss_mixtures = user_params.get("max_gauss_mixtures")
        data = self.prob_smoothed

        #print data

        # http://www.pymix.org/pymix/index.php?n=PyMix.Tutorial

        # make two Gaussians
        gaussian_one = mixture.NormalDistribution(numpy.mean(data),
                                                  numpy.std(data))
        gaussian_two = mixture.NormalDistribution(10.0 * numpy.mean(data),
                                                  numpy.std(data))

        mixture_model = mixture.MixtureModel(2, [0.99, 0.01],
                                             [gaussian_one, gaussian_two])

        # print mixture_model

        EM_tuned = False
        while not EM_tuned:
            # make mix_data from a random 10% of the original data
            index_array = numpy.arange(data.size)
            numpy.random.shuffle(index_array)
            mix_data = mixture.DataSet()
            data_size = numpy.min((int(numpy.floor(data.size / 10.0)), 50000))
            mix_data.fromArray(data[index_array[:data_size]])

            try:
                mixture_model.randMaxEM(mix_data,
                                        max_gauss_mixtures,
                                        40,
                                        0.001,
                                        silent=True)
                EM_tuned = True
            except AssertionError:
                # pymix likes to throw assertion errors when it has small machine precision errors...
                print "Caught an assertion error in pymix, randomizing input and trying again"
            except Exception:
                print "pymix failed to find mixture model, using single gaussian"
                gaussian_two = mixture.NormalDistribution(
                    numpy.mean(data), numpy.std(data))
                EM_tuned = True

        #print mixture_model

        # hacky, no good api access to the model components
        gauss_one_mean = float(
            str(mixture_model.components[0][0]).split('[')[1].split(',')[0])
        gauss_one_std = float(
            str(mixture_model.components[0][0]).split(', ')[1].split(']')[0])

        gauss_two_mean = float(
            str(mixture_model.components[1][0]).split('[')[1].split(',')[0])
        gauss_two_std = float(
            str(mixture_model.components[1][0]).split(', ')[1].split(']')[0])

        print "Gauss1: mu: %f, std: %f" % (gauss_one_mean, gauss_one_std)
        print "Gauss2: mu: %f, std: %f" % (gauss_two_mean, gauss_two_std)

        #print "Using threshold %f" % threshold

        # inv normal cdf
        if gauss_one_mean > gauss_two_mean or mixture_model.pi[1] < 0.60:
            self.thresh_main_mean = gauss_one_mean
            self.thresh_main_std = gauss_one_std
        else:
            self.thresh_main_mean = gauss_two_mean
            self.thresh_main_std = gauss_two_std
Example #17
# iq.txt = iq and achievement test fields from pheno.txt
# drd4_len.txt = drd4 vntr types, only number of repeats
data.fromFiles(["iq.txt", "phys.txt", "drd4_len.txt"])

COMOR = 11
G = 8
components = []
for i in range(G):

    # intelligence and achievement tests as univariate normal distributions (TEST)
    bd_mu = float(random.randint(3, 16))
    bd_sigma = random.uniform(1.0, 8.0)
    missing_bd = mixture.NormalDistribution(-9999.9, 0.00001)
    dist_bd = mixture.NormalDistribution(bd_mu, bd_sigma)
    mix_bd = mixture.MixtureModel(2, [0.999, 0.001], [dist_bd, missing_bd],
                                  compFix=[0, 2])
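    # compFix=[0, 2] presumably pins the second, near-point-mass component
    # (the -9999.9 missing-value code) so EM only re-estimates the real one.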

    voc_mu = float(random.randint(3, 16))
    voc_sigma = random.uniform(1.0, 8.0)
    missing_voc = mixture.NormalDistribution(-9999.9, 0.00001)
    dist_voc = mixture.NormalDistribution(voc_mu, voc_sigma)
    mix_voc = mixture.MixtureModel(2, [0.999, 0.001], [dist_voc, missing_voc],
                                   compFix=[0, 2])

    read_mu = float(random.randint(80, 120))
    read_sigma = random.uniform(1.0, 28.0)
    missing_read = mixture.NormalDistribution(-9999.9, 0.00001)
    dist_read = mixture.NormalDistribution(read_mu, read_sigma)
    mix_read = mixture.MixtureModel(2, [0.999, 0.001],
                                    [dist_read, missing_read],
                                    compFix=[0, 2])
Example #18
def getRandomMixture(G,
                     p,
                     KL_lower,
                     KL_upper,
                     dtypes='discgauss',
                     M=4,
                     seed=None):

    #    if seed:
    #        random.seed(seed)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        #print '*** seed=',seed
    #
    #    else: # XXX debug
    #        seed = random.randint(1,9000000)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        random.seed(seed)
    #        #print '*** seed=',seed

    #M = 4  # Alphabet size for discrete distributions

    min_sigma = 0.1  # minimal std for Normal
    max_sigma = 1.0  # maximal std for Normal
    min_mu = -5.0  # minimal mean
    max_mu = 8.0  # maximal mean

    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p
    elif dtypes == 'discgauss':
        # discrete or Normal features for now, chosen uniformly
        # 0 discrete, 1 Normal
        featureTypes = [random.choice((0, 1)) for i in range(p)]
    else:
        raise TypeError

    #print featureTypes

    C = []
    for j in range(p):
        c_j = []
        for i in range(G):
            #print i,j
            if featureTypes[j] == 0:
                acc = 0
                while acc == 0:
                    cand = mixture.DiscreteDistribution(
                        M, mixture.random_vector(M))

                    #print 'cand:',cand

                    acc = 1

                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(d, cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            #print '  *', cand, 'rejected:', d , KL_dist
                            acc = 0
                            break

                c_j.append(cand)
            elif featureTypes[j] == 1:
                acc = 0
                while acc == 0:
                    mu = random.uniform(min_mu, max_mu)
                    sigma = random.uniform(min_sigma, max_sigma)

                    cand = mixture.NormalDistribution(mu, sigma)

                    #print 'cand:',cand

                    acc = 1

                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(d, cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            #print '  *', cand, 'rejected:', d , KL_dist
                            acc = 0
                            break

                c_j.append(cand)

            else:
                raise RuntimeError

        C.append(c_j)


#    print '\n'
#    for cc in C:
#        print cc

    comps = []
    for i in range(G):
        comps.append(mixture.ProductDistribution([C[j][i] for j in range(p)]))

    pi = get_random_pi(G, 0.1)

    m = mixture.MixtureModel(G, pi, comps, struct=1)
    m.updateFreeParams()

    return m
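# Usage sketch (get_random_pi() is assumed from the surrounding module):
#   m = getRandomMixture(3, 5, KL_lower=0.5, KL_upper=5.0)
#   data = m.sampleDataSet(500)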
Example #19
def clustering(k, feature_cols, feature_domains, header, table, seeds,
               result_file):
    best_loglike = None
    best_model = None
    # Giant random seeding loop.

    data = mx.DataSet()
    data.fromArray(table)
    for r in range(1):
        #  weights = np.random.random_sample(k)
        #  weights_norm = weights / sum(weights)
        weights_norm = [1.0 / k] * k
        components = []
        for i in range(k):
            products = []
            for j in range(table.shape[1]):
                col_type = prep.get_col_type(feature_cols[j], header)
                col_id = feature_cols[j]

                if col_type == 'cat':
                    vals = feature_domains[col_id].keys()
                    cnt_vals = len(vals)
                    rand_dist = np.random.random_sample(cnt_vals)

                    dist = mx.DiscreteDistribution(cnt_vals,
                                                   rand_dist / sum(rand_dist),
                                                   mx.Alphabet(vals))

                elif col_type == 'num':
                    min_val = feature_domains[col_id]['min']
                    max_val = feature_domains[col_id]['max']
                    #  mean = random.uniform(min_val, max_val)
                    mean = seeds[header[col_id][0]][i]
                    stdev = (max_val - min_val) / 2.0 / k

                    dist = mx.NormalDistribution(mean, stdev)

                else:
                    sys.exit(1)
                products.append(dist)

            comp = mx.ProductDistribution(products)
            components.append(comp)

        mix_table = mx.MixtureModel(k, weights_norm, components)
        print mix_table

        # a single randMaxEM run scores this restart
        loglike = mix_table.randMaxEM(data, 1, 50, 50)
        #print loglike
        #print mix_table
        if not best_loglike or loglike > best_loglike:
            best_loglike = loglike
            best_model = copy.copy(mix_table)


    # data.internalInit(mix)
    # mix_table.modelInitialization(data)
    # print best_loglike
    # print best_model

    labels = best_model.classify(data, None, None, 1)

    ## output clustering results

    # count cluster sizes on sampled data
    f = open(result_file + '.stats', 'w')
    cnt = {}
    for l in labels:
        cnt[l] = 1 if l not in cnt else cnt[l] + 1

    for l in cnt:
        f.write('%s %d %f%%\n' %
                (l, cnt[l], cnt[l] * 100.0 / sum(cnt.values())))
    f.close()

    mx.writeMixture(best_model, result_file + '.model')
    return best_model