Example #1
def getBayesModel(G, p, mixPrior=None):
    """
    Constructs a PWM CSI BayesMixtureModel.
    
    @param G: number of components
    @param p: number of positions of the binding site
    @return: BayesMixtureModel object
    """

    if not mixPrior:
        piPrior = mixture.DirichletPrior(G, [1.0] * G)
        compPrior = []
        for i in range(p):
            compPrior.append(
                mixture.DirichletPrior(4, [1.02, 1.02, 1.02, 1.02]))

        # arbitrary values for the structure and component prior parameters;
        # they should be reset by the user via the structPriorHeuristic method
        mixPrior = mixture.MixtureModelPrior(0.05, 0.05, piPrior, compPrior)

    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
    comps = []
    for i in range(G):
        dlist = []
        for j in range(p):
            phi = mixture.random_vector(4)
            dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
        comps.append(mixture.ProductDistribution(dlist))
    pi = mixture.random_vector(G)
    m = mixture.BayesMixtureModel(G, pi, comps, mixPrior, struct=1)
    return m
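A minimal usage sketch for getBayesModel, assuming the PyMix DataSet and EM interface used in the other examples on this page; the training sequences below are made-up placeholders.

# Usage sketch (assumptions: DataSet/EM interface as in Examples #7 and #10;
# the binding-site sequences are hypothetical).
m = getBayesModel(2, 4)  # two components, binding sites of width 4

data = mixture.DataSet()
data.fromList([['A', 'C', 'G', 'T'],
               ['A', 'C', 'G', 'G'],
               ['T', 'C', 'G', 'T'],
               ['A', 'C', 'C', 'T']])
data.internalInit(m)

m.randMaxEM(data, 10, 40, 0.1)  # EM with random restarts, as in Example #7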
Example #2
def readUCSCPrior(filename):
    """
    Reads files in the UCSC Dirichlet Mixture prior (DMP) format (http://www.soe.ucsc.edu/compbio/dirichlets/)
    and converts them into PyMix DirichletMixturePrior objects.
    
    Note that the alphabet in the UCSC priors does not contain the gap symbol. For the output DirichletMixturePrior
    the gap symbol is introduced with a parameter value of 0.01 in all components.
    
    @param filename: file in UCSC DMP format
    
    @return: DirichletMixturePrior object
    
    """
    f = open(filename, 'r')

    ex1 = re.compile(r'Mixture=\s(\d+\.\d+)')
    ex2 = re.compile(r'Order\s*=\s+([A-Z\s]+)')
    ex3 = re.compile(r'Alpha=\s+([\d.\se,+-]+)')

    pi = []
    sigma = None
    dComp = []
    alpha_mat = []
    
    for l in f:
        l = mixture.chomp(l)
        m1 = ex1.match(l)
        if m1:
            pi.append( float(m1.groups(1)[0]))
        m2 = ex2.match(l)
        if m2:
            s = m2.groups(1)[0]
            sigma = s.split(' ')
            
        m3 = ex3.match(l)
        if m3:
            s = m3.groups(1)[0]
            alpha = s.split(' ')
            alpha = map(float,alpha)
            alpha.pop(0)  # first entry is the sum of the others -> remove
            alpha_mat.append(alpha)

    f.close()

    # integrate the gap character '-' into the alphabet
    sigma.append('-')
    alphabet = mixture.Alphabet(sigma)        
   
    for i in range(len(alpha_mat)):
        alpha_mat[i].append(0.01)  # add hyperparameter for the gap symbol '-'
        dComp.append( mixture.DirichletPrior(21,alpha_mat[i]) )

    prior = mixture.DirichletMixturePrior(len(dComp),21,pi,dComp)
    return alphabet,prior
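A usage sketch for readUCSCPrior; the file name below is a hypothetical placeholder for any file in the UCSC DMP format.

# Usage sketch (the file name is a placeholder).
alphabet, prior = readUCSCPrior('dist.20comp')
print(alphabet)  # amino-acid alphabet extended with the gap symbol '-'
print(prior)     # DirichletMixturePrior with 21 hyperparameters per component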
Example #3
def scanSequence(mix, bg, seq,scoring='mix'):
    """
    Scores all positions of a sequence with the given model and background.
    
    @param mix: MixtureModel object
    @param bg: background MixtureModel object
    @param seq: sequence as list of nucleotides
    @param scoring: flag to determine the scoring scheme used for the mixtures. 
      'compmax' means maximum density over the components, 'mix' means true mixture density
    
    @return: list of position-wise log-odds scores
    """
    # convert sequence to internal representation, alphabet of seq must be DNA
    alph = mixture.Alphabet(['A', 'C', 'G', 'T'])
    seq = map(alph.internal, seq)
    
    dnr = mix.components[0].dist_nr

    # init with dummy value at first position
    s = numarray.array([[-1] + seq[0:dnr - 1]])

    score = []
    for i in range(dnr-1,len(seq),1):
        # shift query sequence by one position
        s[0] = numarray.concatenate( [s[0][1:],numarray.array([seq[i]])],0)

        if scoring == 'compmax':
            # score as the maximum density over the components
            c_m_l = numarray.zeros(mix.G, numarray.Float)
            for k in range(mix.G):
                c_m_l[k] = mix.components[k].pdf(s)[0]
            m_l = c_m_l.max()

        elif scoring == 'mix':
            m_l = mix.pdf(s)[0]

        else:
            raise TypeError('unknown scoring scheme: ' + str(scoring))

        bg_l = bg.pdf(s)[0]

        score.append(m_l - bg_l)

    return score
Example #4
def getModel(G, p):
    """
    Constructs a PWM MixtureModel.
    
    @param G: number of components
    @param p: number of positions of the binding site
    @return: MixtureModel object
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
    comps = []
    for i in range(G):
        dlist = []
        for j in range(p):
            phi = mixture.random_vector(4)
            dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
        comps.append(mixture.ProductDistribution(dlist))
    pi = mixture.random_vector(G)
    m = mixture.MixtureModel(G, pi, comps)
    return m
Example #5
    def setUp(self):
        # building generating models
        self.DIAG = mixture.Alphabet(['.', '0', '8', '1'])

        A = [[0.3, 0.6, 0.1], [0.0, 0.5, 0.5], [0.4, 0.2, 0.4]]
        B = [[0.5, 0.2, 0.1, 0.2], [0.5, 0.4, 0.05, 0.05],
             [0.8, 0.1, 0.05, 0.05]]
        pi = [1.0, 0.0, 0.0]
        self.h1 = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)), A, B, pi)

        A2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
        B2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4], [0.2, 0.2, 0.3, 0.3]]
        pi2 = [0.6, 0.4, 0.0]
        self.h2 = mixtureHMM.getHMM(
            mixtureHMM.ghmm.IntegerRange(0, 4),
            mixtureHMM.ghmm.DiscreteDistribution(
                mixtureHMM.ghmm.IntegerRange(0, 4)), A2, B2, pi2)

        n1 = mixture.NormalDistribution(2.5, 0.5)
        n2 = mixture.NormalDistribution(6.0, 0.8)

        mult1 = mixture.MultinomialDistribution(3,
                                                4, [0.23, 0.26, 0.26, 0.25],
                                                alphabet=self.DIAG)
        mult2 = mixture.MultinomialDistribution(3,
                                                4, [0.7, 0.1, 0.1, 0.1],
                                                alphabet=self.DIAG)

        c1 = mixture.ProductDistribution([n1, mult1, self.h1])
        c2 = mixture.ProductDistribution([n2, mult2, self.h2])

        mpi = [0.4, 0.6]
        self.m = mixture.MixtureModel(2, mpi, [c1, c2])

        # mixture for sampling
        gc1 = mixture.ProductDistribution([n1, mult1])
        gc2 = mixture.ProductDistribution([n2, mult2])
        self.gen = mixture.MixtureModel(2, mpi, [gc1, gc2])
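A sketch of a test method that could follow this fixture: self.gen omits the HMM features (per the "mixture for sampling" comment), so plain PyMix sampling applies. The sampleDataSet call and the parameters below are assumptions, not part of the original test.

    def testEmOnSampledData(self):
        # Hypothetical test method: sample from the HMM-free generating
        # mixture and refit it with EM (sampleDataSet and the parameter
        # values are assumptions).
        data = self.gen.sampleDataSet(100)
        self.gen.EM(data, 40, 0.1)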
Example #6
def getBackgroundModel(p, dist=None):
    """
    Constructs a background model.
    
    @param p: number of positions of the binding site
    @param dist: background nucleotide frequencies; uniform is the default
    
    @return: MixtureModel representing the background
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
    dlist = []

    if dist is None:
        phi = [0.25] * 4
    else:
        phi = dist

    for j in range(p):
        dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
    comps = [mixture.ProductDistribution(dlist)]

    m = mixture.MixtureModel(1, [1.0], comps)
    return m
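Together with scanSequence from Example #3, the two helpers above form a complete scanning pipeline. A toy end-to-end run might look like the following sketch; the PWM model is untrained and the input sequence is a made-up placeholder.

# End-to-end sketch (assumes the imports of Example #3; the sequence is a
# made-up toy input and the PWM model has not been trained).
mix = getModel(2, 4)        # Example #4: two components, width 4
bg = getBackgroundModel(4)  # uniform background over the same width
seq = list('ACGTGCGTAACGT')
scores = scanSequence(mix, bg, seq, scoring='mix')
print(scores)               # one log-odds score per window position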
Example #7
import mixture
import random

VNTR = mixture.Alphabet([
    '.', '2/4', '2/7', '3/4', '3/7', '4/4', '4/6', '4/7', '4/8', '4/9', '7/7'
])
DIAG = mixture.Alphabet(['.', '0', '8', '1'])

data = mixture.DataSet()

# iq.txt = iq and achievement test fields from pheno.txt
# drd4_len.txt = drd4 vntr types, only number of repeats
data.fromFiles(["filt_WISC_WIAT_DISC_134.txt"])  # ,"DRD4_134_len.txt"

m = mixture.readMixture('pheno_best.py')

print "Without deterministic anealing:"
m.randMaxEM(data, 100, 30, 0.1, tilt=0, silent=0)

print "\nWith deterministic annealing:"
m.randMaxEM(data, 100, 30, 0.1, tilt=1, silent=0)
Example #8
import mixture
import numpy
import random
import mixtureHMM

# building generating models
DIAG = mixture.Alphabet(['.', '0', '8', '1'])

A = [[0.3, 0.6, 0.1], [0.0, 0.5, 0.5], [0.4, 0.2, 0.4]]
B = [[0.5, 0.2, 0.1, 0.2], [0.5, 0.4, 0.05, 0.05], [0.8, 0.1, 0.05, 0.05]]
pi = [1.0, 0.0, 0.0]
h1 = mixtureHMM.getHMM(
    mixtureHMM.ghmm.IntegerRange(0, 4),
    mixtureHMM.ghmm.DiscreteDistribution(mixtureHMM.ghmm.IntegerRange(0, 4)),
    A, B, pi)

#seq = h1.hmm.sample(10,50)
#print seq

A2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
B2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4], [0.2, 0.2, 0.3, 0.3]]
pi2 = [0.6, 0.4, 0.0]
h2 = mixtureHMM.getHMM(
    mixtureHMM.ghmm.IntegerRange(0, 4),
    mixtureHMM.ghmm.DiscreteDistribution(mixtureHMM.ghmm.IntegerRange(0, 4)),
    A2, B2, pi2)

n1 = mixture.NormalDistribution(2.5, 0.5)
n2 = mixture.NormalDistribution(6.0, 0.8)

mult1 = mixture.MultinomialDistribution(3,
                                        4, [0.23, 0.26, 0.26, 0.25],
                                        alphabet=DIAG)
Example #9
#-----------------------------------------------------------------------------------

G = 3
p = 4
# Bayesian mixture with three components and four features
# (two discrete, two Gaussian)
piPrior = mixture.DirichletDistribution(G, [1.0] * G)

compPrior = []
for i in range(2):
    compPrior.append(mixture.DirichletDistribution(4, [1.02, 1.02, 1.02, 1.02]))
for i in range(2):
    compPrior.append(mixture.NormalGammaDistribution(1.0, 2.0, 3.0, 4.0))

mixPrior = mixture.MixturePrior(0.7, 0.7, piPrior, compPrior)

DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
comps = []
for i in range(G):
    dlist = []
    for j in range(2):
        phi = mixture.random_vector(4)
        dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
    for j in range(2):
        mu = j + 1.0
        sigma = j + 0.5
        dlist.append(mixture.NormalDistribution(mu, sigma))
    comps.append(mixture.ProductDistribution(dlist))
pi = mixture.random_vector(G)
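The snippet ends before the model itself is built. Assuming the same constructor as in Example #1, the final step would presumably be:

# Sketch: assemble the Bayesian mixture from the pieces above, mirroring
# the BayesMixtureModel call in Example #1 (an assumption, not part of the
# original snippet).
m = mixture.BayesMixtureModel(G, pi, comps, mixPrior, struct=1)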
Example #10
def main():
    logger.debug('App started')

    parser = argparse.ArgumentParser(description='Key processing tool')
    parser.add_argument('-t',
                        '--threads',
                        dest='threads',
                        type=int,
                        default=None,
                        help='Number of threads to use for cert download')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_const',
                        const=True,
                        help='enables debug mode')
    parser.add_argument('--verbose',
                        dest='verbose',
                        action='store_const',
                        const=True,
                        help='enables verbose mode')

    parser.add_argument('--dump-json',
                        dest='dump_json',
                        action='store_const',
                        const=True,
                        help='dumps JSON of the filtered certificates')
    parser.add_argument('--dump-cert',
                        dest='dump_cert',
                        action='store_const',
                        const=True,
                        help='dumps PEM of the filtered certificates')

    parser.add_argument(
        '-f',
        '--filter-org',
        dest='filter_org',
        help='Filter out certificates issued with given organization - regex')
    parser.add_argument(
        '--filter-domain',
        dest='filter_domain',
        help='Filter out certificates issued for the given domain - regex')

    parser.add_argument('--pubs',
                        dest='pubs',
                        nargs=argparse.ZERO_OR_MORE,
                        help='File with public keys (PEM)')

    parser.add_argument('--certs',
                        dest='certs',
                        nargs=argparse.ZERO_OR_MORE,
                        help='File with certificates (PEM)')

    parser.add_argument('--ossl',
                        dest='ossl',
                        type=int,
                        default=None,
                        help='OpenSSL generator')

    parser.add_argument('--per-key-stat',
                        dest='per_key_stat',
                        action='store_const',
                        const=True,
                        help='Print prob matching for each key')

    parser.add_argument('--subs',
                        dest='subs',
                        action='store_const',
                        const=True,
                        help='Plot random subgroups charts')
    parser.add_argument('--subs-k',
                        dest='subs_k',
                        type=int,
                        default=5,
                        help='Size of the subset')
    parser.add_argument('--subs-n',
                        dest='subs_n',
                        type=int,
                        default=1000,
                        help='Number of subsets to sample')

    parser.add_argument('--pca-src',
                        dest='pca_src',
                        action='store_const',
                        const=True,
                        help='Plot PCA sampled distribution vs collected one')
    parser.add_argument(
        '--pca-src-n',
        dest='pca_src_n',
        type=int,
        default=10000,
        help='Number of subsets to sample from source distributions')
    parser.add_argument('--pca-src-k',
                        dest='pca_src_k',
                        type=int,
                        default=3,
                        help='Size of the subset from the source distribution')

    parser.add_argument('--pca-grp',
                        dest='pca_grp',
                        action='store_const',
                        const=True,
                        help='Plot PCA on the input keys (groups)')

    parser.add_argument('--mixture',
                        dest='mixture',
                        action='store_const',
                        const=True,
                        help='Mixture distribution on masks - sources')

    parser.add_argument('--distrib',
                        dest='distrib',
                        action='store_const',
                        const=True,
                        help='Plot distributions - to the PDF')

    parser.add_argument('--distrib-mix',
                        dest='distribmix',
                        action='store_const',
                        const=True,
                        help='Plot distributions groups mixed with sources')

    parser.add_argument('--key-dist',
                        dest='plot_key_dist',
                        action='store_const',
                        const=True,
                        help='Plots key mask distribution')

    parser.add_argument('files',
                        nargs=argparse.ZERO_OR_MORE,
                        default=[],
                        help='file with ssl-dump json output')

    args = parser.parse_args()

    last_src_id = 0
    src_names = []
    masks_db = []
    masks_src = []
    cert_db = []
    keys_db = []

    # Input = ssl-dump output
    if len(args.files) > 0:
        # Cert Organization Filtering
        re_org = None if args.filter_org is None else re.compile(
            args.filter_org, re.IGNORECASE)
        # Domain filtering
        re_dom = None if args.filter_domain is None else re.compile(
            args.filter_domain, re.IGNORECASE)

        # Process files
        for fl in args.files:
            with open(fl, mode='r') as fh:
                data = fh.read()

                # Parse json out
                if '-----BEGIN JSON-----' in data:
                    if '-----END JSON-----' not in data:
                        raise ValueError('BEGIN JSON present but END JSON not')
                    match = re.search(
                        r'-----BEGIN JSON-----(.+?)-----END JSON-----', data,
                        re.MULTILINE | re.DOTALL)
                    if match is None:
                        raise ValueError('Could not extract JSON')
                    data = match.group(1)

                json_data = json.loads(data)
                for cert in json_data:
                    org = cert['org']
                    if org is None:
                        org = ''
                    if re_org is not None and re_org.match(org) is None:
                        if args.verbose:
                            print('Organization filtered out %s' % org)
                        continue
                    if re_dom is not None:
                        dom_match = re_dom.match(cert['cn']) is not None
                        for alt in cert['alts']:
                            dom_match |= re_dom.match(alt) is not None
                        if not dom_match:
                            if args.verbose:
                                print('Domain filtered out %s' % cert['cn'])
                            continue

                    cert_db.append(cert)
                    masks_db.append(cert['pubkey']['mask'])
                    masks_src.append(last_src_id)
            src_names.append(fl)
            last_src_id += 1

        if args.verbose:
            print('Certificate database size %d' % len(cert_db))

        if args.dump_json:
            print(json.dumps(cert_db))

        if args.dump_cert:
            for cert in cert_db:
                print(cert['cert'])

    # public key list processing
    if args.pubs is not None:
        for pubf in args.pubs:
            with open(pubf, mode='r') as fh:
                data = fh.read()
                keys = []
                for match in re.finditer(
                        r'-----BEGIN PUBLIC KEY-----(.+?)-----END PUBLIC KEY-----',
                        data, re.MULTILINE | re.DOTALL):
                    key = match.group(0)
                    keys.append(key)
                print('File %s keys num: %d' % (pubf, len(keys)))

                # pubkey -> mask
                for key in keys:
                    pub = serialization.load_pem_public_key(
                        key, utils.get_backend())
                    mask = keys_basic.compute_key_mask(pub.public_numbers().n)
                    keys_db.append(pub)
                    masks_db.append(mask)
                    masks_src.append(last_src_id)
            src_names.append(pubf)
            last_src_id += 1

    # extract public key from certificate
    if args.certs is not None:
        for certf in args.certs:
            with open(certf, mode='r') as fh:
                data = fh.read()
                certs = []
                for match in re.finditer(
                        r'-----BEGIN CERTIFICATE-----(.+?)-----END CERTIFICATE-----',
                        data, re.MULTILINE | re.DOTALL):
                    cert = match.group(0)
                    certs.append(cert)

                # cert -> mask
                for cert in certs:
                    x509 = utils.load_x509(str(cert))
                    pub = x509.public_key()
                    mask = keys_basic.compute_key_mask(pub.public_numbers().n)
                    keys_db.append(pub)
                    masks_db.append(mask)
                    masks_src.append(last_src_id)
            src_names.append(certf)
            last_src_id += 1

    # generate openssl keys on the fly
    if args.ossl is not None:
        for i in range(0, args.ossl):
            print('Generating RSA1024 key %03d' % i)
            key = OpenSSL.crypto.PKey()
            key.generate_key(OpenSSL.crypto.TYPE_RSA, 1024)
            key_pem = OpenSSL.crypto.dump_privatekey(
                OpenSSL.crypto.FILETYPE_PEM, key)

            priv = serialization.load_pem_private_key(key_pem, None,
                                                      utils.get_backend())
            mask = keys_basic.compute_key_mask(
                priv.public_key().public_numbers().n)
            keys_db.append(priv.public_key())
            masks_db.append(mask)
            masks_src.append(last_src_id)
        src_names.append('ossl-%d' % args.ossl)
        last_src_id += 1

    # Load statistics
    st = key_stats.KeyStats()
    st.load_tables()
    if args.verbose:
        print('Source stats: ')
        for src in st.sources_cn:
            print(' %30s: %08d' % (src, st.sources_cn[src]))
        print('Group stats:')
        for grp in st.groups:
            print(' %30s: %02d' % (grp, st.get_group_size(grp)))

    # mask indices
    (mask_map, mask_max, mask_map_x, mask_map_y, mask_map_last_x,
     mask_map_last_y) = keys_basic.generate_pubkey_mask_indices()
    print('Max mask 1D config: [%d]' % mask_max)
    print('Max mask 2D config: [%d, %d]' % (mask_map_last_x, mask_map_last_y))

    # masks processing part
    if len(masks_db) == 0:
        return

    # Simple match
    if args.per_key_stat:
        print('Per-key matching: ')
        for idx, mask in enumerate(masks_db):
            print('Key %02d, mask: %s' % (idx, mask))

            res = []
            for src in st.table_prob:
                val = st.table_prob[src][mask]
                res.append((src, val if val is not None else 0))
            print_res(res, st)

    # Total key matching
    use_loglikelihood = True
    print('Fit for all keys in one distribution:')
    total_weights = src_total_match = comp_total_match_dict(
        masks_db, st, loglikelihood=use_loglikelihood)
    res = key_val_to_list(src_total_match)
    print_res(res, st, loglikelihood=use_loglikelihood)
    res = st.res_src_to_group(res)
    # bar_chart(res=res, title='Fit for all keys')

    # Avg + mean
    print('Avg + mean:')
    src_total_match = {}  # source -> [p1, p2, p3, p4, ..., p_keynum]
    for src in st.table_prob:
        src_total_match[src] = []
        for idx, mask in enumerate(masks_db):
            val = keys_basic.aggregate_mask(st.sources_masks_prob[src], mask)
            if use_loglikelihood:
                if total_weights[src] is not None:
                    src_total_match[src].append(val + total_weights[src])
                else:
                    src_total_match[src].append(-9999.9)
            else:
                src_total_match[src].append(val * total_weights[src])
    res = []
    devs = []
    for src in st.sources:
        m = np.mean(src_total_match[src])
        s = np.std(src_total_match[src])
        res.append((src, m))
        devs.append(s)

    # Total output
    print_res(res, st, error=devs, loglikelihood=use_loglikelihood)
    # bar_chart(res=res, error=devs, title='Avg for all keys + error')

    # PCA on the keys - groups
    keys_grp_vec = []
    for idx, mask in enumerate(masks_db):
        keys_grp_vec.append([])
        for src in st.groups:
            keys_grp_vec[idx].append(0)
        for idxs, src in enumerate(st.sources):
            grp = st.src_to_group(src)
            prob = st.table_prob[src][mask]
            keys_grp_vec[idx][st.get_group_idx(grp)] += prob

    if args.pca_grp:
        X = np.array(keys_grp_vec)
        pca = PCA(n_components=2)
        pca.fit(X)
        X_transformed = pca.transform(X)
        print('PCA mean: %s, components: ' % pca.mean_)
        print(pca.components_)

        masks_src_np = np.array(masks_src)
        plt.rcdefaults()
        colors = matplotlib.cm.rainbow(np.linspace(0, 1, last_src_id))
        for src_id in range(0, last_src_id):
            plt.scatter(X_transformed[masks_src_np == src_id, 0],
                        X_transformed[masks_src_np == src_id, 1],
                        label=src_names[src_id],
                        color=colors[src_id],
                        alpha=0.25,
                        marker=',')
        plt.legend(loc="best", shadow=False, scatterpoints=1)
        plt.show()

    # Random subset
    if args.subs:
        masks_db_tup = []
        for idx, mask in enumerate(masks_db):
            masks_db_tup.append((idx, mask, masks_src[idx]))

        # Many random subsets, top groups
        subs_size = args.subs_k
        subs_count = args.subs_n
        groups_cnt = {}
        subs_data = []
        subs_data_mark = []
        dsrc_num = last_src_id + 1

        # Take subs_count samples from the input masks_db, evaluate them, and prepare for PCA
        for i in range(0, subs_count):
            masks = random_subset(masks_db_tup, subs_size)
            src_total_match = comp_total_match_dict([x[1] for x in masks], st)
            res = key_val_to_list(src_total_match)

            total = 0.0
            for tup in res:
                total += tup[1]

            # data vectors for PCA
            tmp_data = []
            for idx, tmp_src in enumerate(st.sources):
                val = src_total_match[tmp_src]
                val = long(math.floor(val * (1000.0 / total)))
                tmp_data.append(val)

            # PCA on groups.
            # if want PCA on sources, use subs_data.append(tmp_data)
            subs_data.append(tmp_data)
            # res_grp_val = st.res_src_to_group(zip(st.sources, tmp_data))
            # subs_data.append([x[1] for x in res_grp_val])

            subs_dsources = {}
            max_dsrc = (0, 0)
            for dsrc in [x[2] for x in masks]:
                if dsrc not in subs_dsources:
                    subs_dsources[dsrc] = 0
                subs_dsources[dsrc] += 1

            for dsrc in subs_dsources:
                if subs_dsources[dsrc] > max_dsrc[1]:
                    max_dsrc = (dsrc, subs_dsources[dsrc])

            if max_dsrc[1] == subs_size:
                tmp_mark = max_dsrc[0]
            else:
                tmp_mark = last_src_id

            subs_data_mark.append(tmp_mark)

            for tup in res:
                src = tup[0]
                score = long(math.floor(tup[1] * (1000.0 / total)))
                if score == 0:
                    continue

                grp = st.src_to_group(src)
                if grp not in groups_cnt:
                    groups_cnt[grp] = score
                else:
                    groups_cnt[grp] += score

                if src not in groups_cnt:
                    groups_cnt[src] = score
                else:
                    groups_cnt[src] += score

            # Equalize group sizes
            for grp in st.groups:
                grp = grp.lower()
                if grp in groups_cnt:
                    groups_cnt[grp] /= float(st.get_group_size(grp))

            # best group only
            # best_src = res[0][0]
            # best_grp = st.src_to_group(best_src)
            # if best_grp not in groups_cnt:
            #     groups_cnt[best_grp] = 1
            # else:
            #     groups_cnt[best_grp] += 1

        print('Combinations: (N, k)=(%d, %d) = %d' %
              (subs_count, subs_size, scipy.misc.comb(subs_count, subs_size)))

        sources = st.groups
        values = []
        for source in sources:
            val = groups_cnt[source] if source in groups_cnt else 0
            values.append(val)
        bar_chart(sources,
                  values,
                  xlabel='# of occurrences as top group (best fit)',
                  title='Groups vs. %d random %d-subsets' %
                  (subs_count, subs_size))

        # PCA stuff
        X = np.array(subs_data)
        pca = PCA(n_components=2)
        pU, pS, pV = pca._fit(X)
        X_transformed = pca.transform(X)
        subs_data_mark_pca = np.array(subs_data_mark)

        print('Sources: ')
        print(st.sources)

        print('PCA input data shape %d x %d' %
              (len(subs_data), len(subs_data[0])))
        print('PCA mean: \n%s \nPCA components: \n' % pca.mean_)
        print(pca.components_)

        print('PCA components x: ')
        for x in pca.components_[0]:
            print(x)
        print('\nPCA components y: ')
        for y in pca.components_[1]:
            print(y)

        # print('\nPCA U,S,V')
        # print(pU)
        # print(pS)
        # print(pV)

        colors = ['blue', 'red', 'green', 'gray', 'yellow']

        plt.rcdefaults()
        for src_id in range(0, dsrc_num):
            plt.scatter(X_transformed[subs_data_mark_pca == src_id, 0],
                        X_transformed[subs_data_mark_pca == src_id, 1],
                        color=colors[src_id],
                        alpha=0.5 if src_id < dsrc_num - 1 else 0.2)
        plt.legend(loc="best", shadow=False, scatterpoints=1)

        # plt.scatter([x[0] for x in X_transformed],
        #             [x[1] for x in X_transformed],
        #             alpha=0.5)

        plt.show()

        # PCA against defined sources with known distributions?
        # Creates "background distribution" we want to match to
        if args.pca_src:
            # Four axes, returned as a 2-d array
            plt.rcdefaults()
            #f, axarr = plt.subplots(len(st.sources), 1)
            src_k = args.pca_src_k
            src_n = args.pca_src_n

            # prepare PDF
            ppdf = PdfPages('test.pdf')  # TODO: derive the file name from the input set
            sources_to_test = st.sources[20:25] + [
                x for x in st.sources if 'micro' in x.lower()
            ]

            # compute for each source
            src_mark_idx = len(subs_data_mark)
            subs_data_src = subs_data
            subs_data_mark_src = subs_data_mark
            for src_idx, source in enumerate(sources_to_test):
                # cur_plot = axarr[src_idx]
                cur_plot = plt

                print('Plotting PCA source %s %d/%d' %
                      (source, src_idx + 1, len(sources_to_test)))

                # Extend subs_data_src with draws from the source distribution
                for i in range(0, src_n):
                    masks = []
                    for tmpk in range(0, src_k):
                        masks.append(st.sample_source_distrib(source))
                    src_total_match = comp_total_match_dict(masks, st)
                    res = key_val_to_list(src_total_match)

                    total = 0.0
                    for tup in res:
                        total += tup[1]

                    # data vectors for PCA
                    tmp_data = []
                    for idx, tmp_src in enumerate(st.sources):
                        val = src_total_match[tmp_src]
                        val = long(math.floor(val * (1000.0 / total)))
                        tmp_data.append(val)

                    # PCA on groups.
                    # if want PCA on sources, use subs_data.append(tmp_data)
                    subs_data_src.append(tmp_data)
                    subs_data_mark_src.append(src_mark_idx)

                # PCA stuff
                X = np.array(subs_data_src)
                pca = PCA(n_components=2)
                pU, pS, pV = pca._fit(X)
                X_transformed = pca.transform(X)
                subs_data_mark_pca = np.array(subs_data_mark_src)

                colors = ['blue', 'red', 'green', 'gray', 'yellow']

                # plot input sources
                for src_id in range(0, dsrc_num):
                    cur_plot.scatter(
                        X_transformed[subs_data_mark_pca == src_id, 0],
                        X_transformed[subs_data_mark_pca == src_id, 1],
                        color=colors[src_id],
                        alpha=0.5 if src_id < dsrc_num - 1 else 0.2)

                # plot the source stuff
                cur_plot.scatter(
                    X_transformed[subs_data_mark_pca == src_mark_idx, 0],
                    X_transformed[subs_data_mark_pca == src_mark_idx, 1],
                    color='gray',
                    marker='+',
                    alpha=0.05)

                cur_plot.legend(loc="best", shadow=False, scatterpoints=1)
                cur_plot.title('Src [%s] input: %s' % (source,
                                                       (', '.join(src_names))))

                cur_plot.savefig(ppdf, format='pdf')
                cur_plot.clf()

            print('Finalizing PDF...')
            # plt.savefig(ppdf, format='pdf')
            ppdf.close()

    if args.distrib:
        # Plotting distributions for groups, to the PDF
        plt.rcdefaults()
        ppdf = PdfPages('groups_distrib.pdf')

        # Compute for each source
        range_ = st.masks
        range_idx = np.arange(len(st.masks))
        for grp_idx, grp in enumerate(st.groups):
            cur_data = st.groups_masks_prob[grp]
            raw_data = [cur_data[x] for x in st.masks]
            cur_plot = plt

            logger.debug('Plotting distribution %02d/%02d : %s ' %
                         (grp_idx + 1, len(st.groups), grp))
            axes = cur_plot.gca()
            axes.set_xlim([0, len(st.masks)])
            cur_plot.bar(range_idx, raw_data, linewidth=0, width=0.4)
            cur_plot.title('%s (%s)' % (grp, get_group_desc(grp, st)))
            cur_plot.savefig(ppdf, format='pdf')
            cur_plot.clf()

        # Print input data - per source
        max_src = max(masks_src)
        bars = []
        for src_id in range(max_src + 1):
            axes = plt.gca()
            axes.set_xlim([0, len(st.masks)])

            map_data = {}
            for mask in st.masks:
                map_data[mask] = 0.0
            for mask_idx, mask in enumerate(masks_db):
                if masks_src[mask_idx] == src_id:
                    map_data[mask] += 1

            raw_data = []
            for mask in st.masks:
                raw_data.append(map_data[mask])

            b1 = plt.bar(range_idx, raw_data, linewidth=0, width=0.4)
            bars.append(b1)

            plt.title('Source %d' % src_id)
            plt.savefig(ppdf, format='pdf')
            plt.clf()

        # Group distribution + source:
        if args.distribmix:
            width = 0.25
            range_idx = np.arange(len(st.masks))

            # One source to the graph
            max_src = max(masks_src)
            cur_plot = plt
            for src_id in range(max_src + 1):

                bars = []
                logger.debug('Plotting mix distribution src %d ' % src_id)

                map_data = {}
                for mask in st.masks:
                    map_data[mask] = 0.0
                for mask_idx, mask in enumerate(masks_db):
                    if masks_src[mask_idx] == src_id:
                        map_data[mask] += 1

                raw_data = []
                for mask in st.masks:
                    raw_data.append(map_data[mask])
                raw_data = np.array(raw_data)
                raw_data /= float(sum(raw_data))

                for grp_idx, grp in enumerate(st.groups):
                    logger.debug(
                        ' - Plotting mix distribution %02d/%02d : %s ' %
                        (grp_idx + 1, len(st.groups), grp))

                    # Source
                    fig, ax = plt.subplots()
                    b1 = ax.bar(range_idx + width,
                                raw_data,
                                linewidth=0,
                                width=width,
                                color='r')
                    bars.append(b1)

                    # Group
                    cur_data2 = st.groups_masks_prob[grp]
                    raw_data2 = [cur_data2[x] for x in st.masks]

                    bar1 = ax.bar(range_idx,
                                  raw_data2,
                                  linewidth=0,
                                  width=width,
                                  color='b')
                    bars.append(bar1)

                    ax.legend(tuple([x[0] for x in bars]),
                              tuple(['Src %d' % src_id, grp]))
                    ax.set_xlim([0, len(st.masks)])

                    cur_plot.title('%s + source %d' % (grp, src_id))
                    cur_plot.savefig(ppdf, format='pdf')
                    cur_plot.clf()

        logger.info('Finishing PDF')
        ppdf.close()

    if args.mixture:
        # http://www.pymix.org/pymix/index.php?n=PyMix.Tutorial#bayesmix
        # 1. Create the mixture model components - one discrete distribution per source
        dists = []
        alphabet = mixture.Alphabet(st.masks)
        taken_src = []

        for src in st.sources:
            if src not in ('openssl 1.0.2g', 'microsoft .net'):
                continue
            print(' - Source: %s' % src)

            taken_src.append(src)
            probs = []
            for m in st.masks:
                probs.append(st.sources_masks_prob[src][m])

            d = mixture.DiscreteDistribution(len(alphabet),
                                             probs,
                                             alphabet=alphabet)
            dists.append(d)

        # 2. Create the model, for now, with even distribution among components.
        comp_weights = [1.0 / len(dists)] * len(dists)
        mmodel = mixture.MixtureModel(len(dists), comp_weights, dists)
        print('-' * 80)
        print(mmodel)
        print('-' * 80)

        # dump mixtures to the file
        mixture.writeMixture(mmodel, 'src.mix')

        # 3. Input data - array of input masks
        masks_data = [[x] for x in masks_db]
        data = mixture.DataSet()
        data.fromList(masks_data)
        data.internalInit(mmodel)

        print(masks_data)
        print(data)
        print('---------')

        # 4. Compute EM
        # If a source distribution has zero matching inputs in the data,
        # an exception will be thrown; such sources should eventually be
        # discarded from the input.
        print(mmodel.modelInitialization(data, 1))
        print('EM start: ')

        ress = []
        for r in range(10):
            mmodel.modelInitialization(data, 1)
            emres = mmodel.EM(data, 1000, 1e-17)
            ress.append(emres)
        emres = max(ress, key=lambda x: x[1])

        # print mmodel.randMaxEM(data, 10, 40, 0.1)
        print(emres)

        # Plot
        plt.rcdefaults()
        # plt.plot(range(0, len(emres[0][3])), [2.71828**x for x in emres[0][3]], 'o')
        # plt.plot(range(0, len(emres[0][3])), emres[0][3], 'k')
        # plt.show()

        for i in range(0, 5):
            print('-------')
            for idx, src in enumerate(emres[0]):
                print('- i:%02d src: %02d, val: %s' % (i, idx, src[i]))

        colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(taken_src)))
        range_ = range(0, len(emres[0][0]))
        bars = []
        for idx, src in enumerate(emres[0]):
            b1 = plt.bar(range_, [2.71828**x for x in src], color=colors[idx])
            bars.append(b1)

        plt.legend(tuple(bars), tuple(taken_src))
        plt.grid(True)
        plt.show()

        # for src in emres[0]:
        #     plt.plot(range(0, len(src)), [2.71828**x for x in src], 'o')
        #     # plt.grid(True)
        #     # plt.show()
        #
        # # plt.scatter(mask_map_last_x, mask_map_last_y, c='red', s=scale, alpha=0.3)
        # # plt.legend()
        # plt.grid(True)
        # plt.show()

    # Chisquare
    for source in st.sources_masks:
        cn = st.sources_cn[source]
        # chi = chisquare()
        # gen = keys_basic.generate_pubkey_mask()

    # 2D Key plot
    if args.plot_key_dist:
        plot_key_mask_dist(masks_db, st)
Example #11
def clustering(k, feature_cols, feature_domains, header, table, seeds,
               result_file):
    best_loglike = None
    best_model = None
    # Giant random seeding loop

    data = mx.DataSet()
    data.fromArray(table)
    for r in range(1):
        #  weights = np.random.random_sample(k)
        #  weights_norm = weights / sum(weights)
        weights_norm = [1.0 / k] * k
        components = []
        for i in range(k):
            products = []
            for j in range(table.shape[1]):
                col_type = prep.get_col_type(feature_cols[j], header)
                col_id = feature_cols[j]

                if col_type == 'cat':
                    vals = feature_domains[col_id].keys()
                    cnt_vals = len(vals)
                    rand_dist = np.random.random_sample(cnt_vals)

                    dist = mx.DiscreteDistribution(cnt_vals,
                                                   rand_dist / sum(rand_dist),
                                                   mx.Alphabet(vals))

                elif col_type == 'num':
                    min_val = feature_domains[col_id]['min']
                    max_val = feature_domains[col_id]['max']
                    #  mean = random.uniform(min_val, max_val)
                    mean = seeds[header[col_id][0]][i]
                    stdev = (max_val - min_val) / 2.0 / k

                    dist = mx.NormalDistribution(mean, stdev)

                else:
                    sys.exit(1)
                products.append(dist)

            comp = mx.ProductDistribution(products)
            components.append(comp)

        mix_table = mx.MixtureModel(k, weights_norm, components)
        print(mix_table)

        loglike = mix_table.randMaxEM(data, 1, 50, 50)
        #print loglike
        #print mix_table
        if not best_loglike or loglike > best_loglike:
            best_loglike = loglike
            best_model = copy.copy(mix_table)


#data.internalInit(mix)
# mix_table.modelInitialization(data)
#  print best_loglike
#  print best_model

    labels = best_model.classify(data, None, None, 1)

    ## output clustering results

    # count cluster sizes on sampled data
    f = open(result_file + '.stats', 'w')
    cnt = {}
    for l in labels:
        cnt[l] = 1 if l not in cnt else cnt[l] + 1

    for l in cnt:
        f.write('%s %d %f%%\n' %
                (l, cnt[l], cnt[l] * 100.0 / sum(cnt.values())))
    f.close()

    mx.writeMixture(best_model, result_file + '.model')
    return best_model