Beispiel #1
0
def test_estep():
    '''
    Runs a single E-step with a no-op kernel on the SIZE10 trace and checks
    that the per-hypernode counts and the totals of both count matrices hold
    the same values before and after the step.
    '''
    state = dataio.initialize_trace(files.SIZE10, 2, 10)
    (tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id) = state

    no_priors = np.zeros(0, dtype='d')
    kernel = NoopKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[1], no_priors)

    alpha_zh = beta_zs = .1

    def check_matrix_totals():
        # Totals of the two count matrices
        assert_equal(Count_zh.sum(), 10)
        assert_equal(Count_sz.sum(), 20)

    def check_hyper_counts():
        # Expected number of events for hypernodes 0, 1 and 2
        for hyper_id, expected in enumerate((4, 4, 2)):
            assert_equal(count_h[hyper_id], expected)

    check_matrix_totals()
    check_hyper_counts()

    # The return value is not inspected; only the count arrays are checked
    _learn._e_step(tstamps, Trace, previous_stamps, Count_zh,
                   Count_sz, count_h, count_z, alpha_zh, beta_zs,
                   prob_topics_aux, kernel)

    check_hyper_counts()
    check_matrix_totals()
Beispiel #2
0
def test_em():
    '''
    Runs _learn.em with a no-op kernel on the SIZE10 trace and checks that
    Theta_zh and Psi_sz go from all zeros to matrices whose sums along axis 0
    are (almost) one, that both count matrices changed, and that count_h did
    not.
    '''
    state = dataio.initialize_trace(files.SIZE10, 2, 10)
    (tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id) = state

    no_priors = np.zeros(0, dtype='d')
    kernel = NoopKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[1], no_priors)

    alpha_zh = beta_zs = .1

    # Both probability matrices must start out zeroed
    for matrix in (Theta_zh, Psi_sz):
        assert (matrix == 0).all()

    # Snapshot the count arrays so changes made by em() can be detected
    old_Count_zh, old_Count_sz, old_count_h, old_count_z = \
        [arr.copy() for arr in (Count_zh, Count_sz, count_h, count_z)]

    _learn.em(tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h,
              count_z, alpha_zh, beta_zs, prob_topics_aux, Theta_zh, Psi_sz,
              10, 2, kernel)

    # At least one entry of each matrix became positive
    for matrix in (Theta_zh, Psi_sz):
        assert (matrix > 0).sum() > 0

    # Sums along axis 0 are (almost) one
    for matrix in (Theta_zh, Psi_sz):
        assert_almost_equal(1, matrix.sum(axis=0))

    assert (old_Count_zh != Count_zh).any()
    assert (old_Count_sz != Count_sz).any()

    # the count_h should not change
    assert (old_count_h == count_h).all()
Beispiel #3
0
def test_em():
    '''
    Runs _learn.em with a no-op kernel on the SIZE10 trace and checks that
    Theta_zh and Psi_sz go from all zeros to matrices whose sums along axis 0
    are (almost) one, that both count matrices changed, and that count_h did
    not.
    '''
    state = dataio.initialize_trace(files.SIZE10, 2, 10)
    (tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id) = state

    no_priors = np.zeros(0, dtype='d')
    kernel = NoopKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[1], no_priors)

    alpha_zh = beta_zs = .1

    # Both probability matrices must start out zeroed
    for matrix in (Theta_zh, Psi_sz):
        assert (matrix == 0).all()

    # Snapshot the count arrays so changes made by em() can be detected
    old_Count_zh, old_Count_sz, old_count_h, old_count_z = \
        [arr.copy() for arr in (Count_zh, Count_sz, count_h, count_z)]

    _learn.em(tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h,
              count_z, alpha_zh, beta_zs, prob_topics_aux, Theta_zh, Psi_sz,
              10, 2, kernel)

    # At least one entry of each matrix became positive
    for matrix in (Theta_zh, Psi_sz):
        assert (matrix > 0).sum() > 0

    # Sums along axis 0 are (almost) one
    for matrix in (Theta_zh, Psi_sz):
        assert_almost_equal(1, matrix.sum(axis=0))

    assert (old_Count_zh != Count_zh).any()
    assert (old_Count_sz != Count_sz).any()

    # the count_h should not change
    assert (old_count_h == count_h).all()
Beispiel #4
0
def test_estep():
    '''
    Runs a single E-step with a no-op kernel on the SIZE10 trace and checks
    that the per-hypernode counts and the totals of both count matrices hold
    the same values before and after the step.
    '''
    state = dataio.initialize_trace(files.SIZE10, 2, 10)
    (tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id) = state

    no_priors = np.zeros(0, dtype='d')
    kernel = NoopKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[1], no_priors)

    alpha_zh = beta_zs = .1

    def check_matrix_totals():
        # Totals of the two count matrices
        assert_equal(Count_zh.sum(), 10)
        assert_equal(Count_sz.sum(), 20)

    def check_hyper_counts():
        # Expected number of events for hypernodes 0, 1 and 2
        for hyper_id, expected in enumerate((4, 4, 2)):
            assert_equal(count_h[hyper_id], expected)

    check_matrix_totals()
    check_hyper_counts()

    # The return value is not inspected; only the count arrays are checked
    _learn._e_step(tstamps, Trace, previous_stamps, Count_zh,
                   Count_sz, count_h, count_z, alpha_zh, beta_zs,
                   prob_topics_aux, kernel)

    check_hyper_counts()
    check_matrix_totals()
Beispiel #5
0
def fit(trace_fpath, num_topics, alpha_zh, beta_zs, kernel,
        residency_priors, num_iter, burn_in, from_=0, to=np.inf):
    '''
    Learns the latent topics from a temporal hypergraph trace.

    Node-Sherlock is an EM algorithm in which the E-step is a Gibbs sampling
    update, which is why a `num_iter` / `burn_in` scheme is used instead of
    the usual convergence check.

    Parameters
    ----------
    trace_fpath : str
        The path of the trace. Each line should be a
        (timestamp, hypernode, source, destination) record where the
        timestamp is a long (seconds or milliseconds from epoch).

    num_topics : int
        The number of latent spaces to learn

    alpha_zh : float
        The value of the alpha_zh hyperparameter

    beta_zs : float
        The value of the beta_zs (beta) hyperparameter

    kernel : Kernel object
        The kernel to use

    residency_priors : array of float
        The kernel hyper parameters

    num_iter : int
        The number of iterations to learn the model from

    burn_in : int
        The burn_in of the chain

    from_, to : optional
        Forwarded unchanged to `dataio.initialize_trace` and
        `prepare_results` (presumably the bounds of the part of the trace
        to use -- confirm against `dataio`).

    Returns
    -------

    A dictionary with the results.
    '''
    # Load the trace and allocate every array of the sampler state
    state = dataio.initialize_trace(trace_fpath, num_topics, num_iter,
                                    from_, to)
    (Dts, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id) = state

    em(Dts, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z,
       alpha_zh, beta_zs, prob_topics_aux, Theta_zh, Psi_sz, num_iter,
       burn_in, kernel)

    results = prepare_results(trace_fpath, num_topics, alpha_zh, beta_zs,
                              kernel, residency_priors, num_iter, burn_in,
                              Dts, Trace, Count_zh, Count_sz, count_h,
                              count_z, prob_topics_aux, Theta_zh, Psi_sz,
                              hyper2id, source2id, from_, to)
    results['algorithm'] = np.asarray(['serial gibbs + em'])
    return results
Beispiel #6
0
def test_split():
    '''
    Checks that dynamic.split divides a single topic into two.

    All ten events are forced into topic 0 with timestamps that form two
    groups (1..5 and 100..500). After the split the first five events must
    be in topic 0 and the last five in topic 1, and every count structure
    must have grown by one topic.
    '''
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    alpha_zh = beta_zs = 0.1

    # Assign every event to topic 0 and replace the stamps of that topic
    Trace[:, -1] = 0
    previous_stamps._clear()
    tstamps = np.array([[1.0, 2.0, 3.0, 4.0, 5.0, 100, 200, 300, 400, 500]]).T
    # The transpose above is re-materialised as a C-ordered array
    tstamps = np.array(tstamps, order='C')
    previous_stamps._extend(0, tstamps[:, 0])

    # Single-topic count structures, filled in by fast_populate
    Count_zh = np.zeros(shape=(1, Count_zh.shape[1]), dtype='i4')
    Count_sz = np.zeros(shape=(Count_sz.shape[0], 1), dtype='i4')
    count_z = np.zeros(shape=(1, ), dtype='i4')

    fast_populate(Trace, Count_zh, Count_sz, count_h, count_z)
    kernel = ECCDFKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[0], \
            np.array([1.0, Count_zh.shape[0] - 1]))

    # One likelihood slot for the single topic
    ll_per_z = np.zeros(1, dtype='f8')

    quality_estimate(tstamps, Trace, \
                previous_stamps, Count_zh, Count_sz, count_h, \
                count_z, alpha_zh, beta_zs, \
                ll_per_z, np.arange(Trace.shape[0], dtype='i4'), \
                kernel)

    # The trailing .5 and 0 are extra arguments interpreted by dynamic.split
    Trace_new, Count_zh_new, Count_sz_new, count_z_new, \
            new_stamps, _ = \
            dynamic.split(tstamps, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, alpha_zh, beta_zs, \
            ll_per_z, kernel, .5, 0)

    assert_array_equal(Trace_new[:, -1], [0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    assert_array_equal(new_stamps._get_all(0), [1, 2, 3, 4, 5])
    assert_array_equal(new_stamps._get_all(1), [100, 200, 300, 400, 500])

    assert Count_zh_new.shape[0] > Count_zh.shape[0]
    assert Count_zh_new.shape[1] == Count_zh.shape[1]
    assert Count_zh_new[0].sum() == 5
    assert Count_zh_new[1].sum() == 5

    assert Count_sz_new.shape[0] == Count_sz.shape[0]
    assert Count_sz_new.shape[1] > Count_sz.shape[1]
    assert Count_sz_new[:, 0].sum() == 10
    assert Count_sz_new[:, 1].sum() == 10

    assert count_z_new.shape[0] > count_z.shape[0]
    assert count_z_new[0] == 10
    assert count_z_new[1] == 10
def test_split():
    '''
    Checks that dynamic.split divides a single topic into two.

    All ten events are forced into topic 0 with timestamps that form two
    groups (1..5 and 100..500). After the split the first five events must
    be in topic 0 and the last five in topic 1, and every count structure
    must have grown by one topic.
    '''
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    alpha_zh = beta_zs = 0.1

    # Assign every event to topic 0 and replace the stamps of that topic
    Trace[:, -1] = 0
    previous_stamps._clear()
    tstamps = np.array([[1.0, 2.0, 3.0, 4.0, 5.0, 100, 200, 300, 400, 500]]).T
    # The transpose above is re-materialised as a C-ordered array
    tstamps = np.array(tstamps, order='C')
    previous_stamps._extend(0, tstamps[:, 0])

    # Single-topic count structures, filled in by fast_populate
    Count_zh = np.zeros(shape=(1, Count_zh.shape[1]), dtype='i4')
    Count_sz = np.zeros(shape=(Count_sz.shape[0], 1), dtype='i4')
    count_z = np.zeros(shape=(1, ), dtype='i4')

    fast_populate(Trace, Count_zh, Count_sz, count_h, count_z)
    kernel = ECCDFKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[0], \
            np.array([1.0, Count_zh.shape[0] - 1]))

    # One likelihood slot for the single topic
    ll_per_z = np.zeros(1, dtype='f8')

    quality_estimate(tstamps, Trace, \
                previous_stamps, Count_zh, Count_sz, count_h, \
                count_z, alpha_zh, beta_zs, \
                ll_per_z, np.arange(Trace.shape[0], dtype='i4'), \
                kernel)

    # The trailing .5 and 0 are extra arguments interpreted by dynamic.split
    Trace_new, Count_zh_new, Count_sz_new, count_z_new, \
            new_stamps, _ = \
            dynamic.split(tstamps, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, alpha_zh, beta_zs, \
            ll_per_z, kernel, .5, 0)

    assert_array_equal(Trace_new[:, -1], [0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    assert_array_equal(new_stamps._get_all(0), [1, 2, 3, 4, 5])
    assert_array_equal(new_stamps._get_all(1), [100, 200, 300, 400, 500])

    assert Count_zh_new.shape[0] > Count_zh.shape[0]
    assert Count_zh_new.shape[1] == Count_zh.shape[1]
    assert Count_zh_new[0].sum() == 5
    assert Count_zh_new[1].sum() == 5

    assert Count_sz_new.shape[0] == Count_sz.shape[0]
    assert Count_sz_new.shape[1] > Count_sz.shape[1]
    assert Count_sz_new[:, 0].sum() == 10
    assert Count_sz_new[:, 1].sum() == 10

    assert count_z_new.shape[0] > count_z.shape[0]
    assert count_z_new[0] == 10
    assert count_z_new[1] == 10
def test_correlate_all():
    '''
    Checks the matrix returned by dynamic.correlate_counts for the SIZE10
    trace with 2 topics: it is 2x2, entry [0, 1] is non-zero, and the lower
    triangle (diagonal included) is all zeros.
    '''
    state = dataio.initialize_trace(files.SIZE10, 2, 10)
    (tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id) = state

    corr = dynamic.correlate_counts(Count_zh, Count_sz, count_h, count_z,
                                    .1, .1)

    assert_equal((2, 2), corr.shape)
    assert corr[0, 1] != 0

    # np.tril keeps the diagonal, so it must be zero as well
    lower_part = np.tril(corr)
    assert (lower_part == 0).all()
Beispiel #9
0
def test_correlate_all():
    '''
    Checks the matrix returned by dynamic.correlate_counts for the SIZE10
    trace with 2 topics: it is 2x2, entry [0, 1] is non-zero, and the lower
    triangle (diagonal included) is all zeros.
    '''
    state = dataio.initialize_trace(files.SIZE10, 2, 10)
    (tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id) = state

    corr = dynamic.correlate_counts(Count_zh, Count_sz, count_h, count_z,
                                    .1, .1)

    assert_equal((2, 2), corr.shape)
    assert corr[0, 1] != 0

    # np.tril keeps the diagonal, so it must be zero as well
    lower_part = np.tril(corr)
    assert (lower_part == 0).all()
Beispiel #10
0
def test_merge():
    '''
    Checks that dynamic.merge collapses two topics into one.

    Hypernode, source and destination are set to 0 for every event; the
    first five events are put in topic 0 and the last five in topic 1.
    After the merge, topic 0 must own all ten stamps and every count
    structure must have one topic fewer.
    '''
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    kernel = ECCDFKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[0], \
            np.array([1.0, Count_zh.shape[0] - 1]))

    Trace[:, 0] = 0
    Trace[:, 1] = 0
    Trace[:, 2] = 0
    Trace[:, 3] = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

    # Reset every count before repopulating them from the edited trace
    Count_sz[:] = 0
    Count_zh[:] = 0
    count_z[:] = 0
    count_h[:] = 0

    fast_populate(Trace, Count_zh, Count_sz, count_h, count_z)
    # Result unused; the call is kept as it was in the original test
    dynamic.correlate_counts(Count_zh, Count_sz, count_h, count_z, .1, .1)

    alpha_zh = beta_zs = 0.1

    ll_per_z = np.zeros(2, dtype='f8')

    quality_estimate(tstamps, Trace, \
                previous_stamps, Count_zh, Count_sz, count_h, \
                count_z, alpha_zh, beta_zs, ll_per_z, \
                np.arange(Trace.shape[0], dtype='i4'), kernel)

    Trace_new, Count_zh_new, Count_sz_new, \
            count_z_new, new_stamps, _ = \
            dynamic.merge(tstamps, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, alpha_zh, beta_zs, ll_per_z, kernel)

    print(Trace_new)
    print(np.array(new_stamps._get_all(0)))
    assert len(new_stamps._get_all(0)) == 10
    assert Count_zh_new.shape[0] < Count_zh.shape[0]
    assert Count_zh_new.shape[1] == Count_zh.shape[1]

    assert Count_sz_new.shape[0] == Count_sz.shape[0]
    # Compare topic columns with topic columns: the previous check used
    # Count_sz.shape[0] (the row count) and therefore could not fail
    assert Count_sz_new.shape[1] < Count_sz.shape[1]

    assert count_z_new.shape[0] < count_z.shape[0]
def test_merge():
    '''
    Checks that dynamic.merge collapses two topics into one.

    Hypernode, source and destination are set to 0 for every event; the
    first five events are put in topic 0 and the last five in topic 1.
    After the merge, topic 0 must own all ten stamps and every count
    structure must have one topic fewer.
    '''
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    kernel = ECCDFKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[0], \
            np.array([1.0, Count_zh.shape[0] - 1]))

    Trace[:, 0] = 0
    Trace[:, 1] = 0
    Trace[:, 2] = 0
    Trace[:, 3] = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

    # Reset every count before repopulating them from the edited trace
    Count_sz[:] = 0
    Count_zh[:] = 0
    count_z[:] = 0
    count_h[:] = 0

    fast_populate(Trace, Count_zh, Count_sz, count_h, count_z)
    # Result unused; the call is kept as it was in the original test
    dynamic.correlate_counts(Count_zh, Count_sz, count_h, count_z, .1, .1)

    alpha_zh = beta_zs = 0.1

    ll_per_z = np.zeros(2, dtype='f8')

    quality_estimate(tstamps, Trace, \
                previous_stamps, Count_zh, Count_sz, count_h, \
                count_z, alpha_zh, beta_zs, ll_per_z, \
                np.arange(Trace.shape[0], dtype='i4'), kernel)

    Trace_new, Count_zh_new, Count_sz_new, \
            count_z_new, new_stamps, _ = \
            dynamic.merge(tstamps, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, alpha_zh, beta_zs, ll_per_z, kernel)

    print(Trace_new)
    print(np.array(new_stamps._get_all(0)))
    assert len(new_stamps._get_all(0)) == 10
    assert Count_zh_new.shape[0] < Count_zh.shape[0]
    assert Count_zh_new.shape[1] == Count_zh.shape[1]

    assert Count_sz_new.shape[0] == Count_sz.shape[0]
    # Compare topic columns with topic columns: the previous check used
    # Count_sz.shape[0] (the row count) and therefore could not fail
    assert Count_sz_new.shape[1] < Count_sz.shape[1]

    assert count_z_new.shape[0] < count_z.shape[0]
Beispiel #12
0
def test_sample():
    '''
    Draws a topic for the event at index 3 with _learn._sample (no-op
    kernel) and checks the returned value against an upper bound.
    '''
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)
    kernel = NoopKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[1], np.zeros(0, dtype='d'))

    tstamp_idx = 3

    new_topic = _learn._sample(tstamp_idx, tstamps, Trace, \
            previous_stamps, Count_zh, Count_sz, count_h, \
            count_z, .1, .1, prob_topics_aux, kernel)

    # NOTE(review): the trace is initialised with 2 topics, so a bound of 3
    # looks loose -- confirm whether `new_topic < 2` was intended
    assert new_topic <= 3
Beispiel #13
0
def test_sample():
    '''
    Draws a topic for the event at index 3 with _learn._sample (no-op
    kernel) and checks the returned value against an upper bound.
    '''
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)
    kernel = NoopKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[1], np.zeros(0, dtype='d'))

    tstamp_idx = 3

    new_topic = _learn._sample(tstamp_idx, tstamps, Trace, \
            previous_stamps, Count_zh, Count_sz, count_h, \
            count_z, .1, .1, prob_topics_aux, kernel)

    # NOTE(review): the trace is initialised with 2 topics, so a bound of 3
    # looks loose -- confirm whether `new_topic < 2` was intended
    assert new_topic <= 3
Beispiel #14
0
def test_initialize():
    '''
    Checks the state built by dataio.initialize_trace for the SIZE10 trace
    with 2 topics: id maps, trace shape, stamps per topic, count vectors and
    matrices, and the agreement between topic assignments and count_z.
    '''
    state = dataio.initialize_trace(files.SIZE10, 2, 10)
    (tstamps, Trace, previous_stamps, Count_zh, Count_oz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_oz, hyper2id, obj2id) = state

    assert_equal(len(hyper2id), 3)
    assert_equal(len(obj2id), 6)

    # Ten events, four columns each
    for axis, expected in enumerate((10, 4)):
        assert_equal(Trace.shape[axis], expected)

    # Each of the two topics must own at least one stamp
    for z in (0, 1):
        assert previous_stamps._size(z) > 0

    for hyper_id, expected in enumerate((4, 4, 2)):
        assert_equal(count_h[hyper_id], expected)

    assert_equal(count_z.sum(), 20)  # depends on memory

    # We can only test shapes and sums, since assignments are random
    assert_equal(Count_zh.shape, (2, 3))
    assert_equal(Count_oz.shape, (6, 2))

    assert_equal(Count_zh.sum(), 10)
    assert_equal(Count_oz.sum(), 20)

    assert (prob_topics_aux == 0).all()

    # Simple sanity check on topic assignments: topics must have valid ids
    # and their frequency in the trace must be half of the count_z entry
    # (count_z sums to 20 for the 10 events)
    from collections import Counter
    topic_freq = Counter(Trace[:, -1])
    for topic, freq in topic_freq.items():
        assert topic in [0, 1]
        assert freq == count_z[topic] / 2
Beispiel #15
0
def test_initialize():
    '''
    Checks the state built by dataio.initialize_trace for the SIZE10 trace
    with 2 topics: id maps, trace shape, stamps per topic, count vectors and
    matrices, and the agreement between topic assignments and count_z.
    '''
    state = dataio.initialize_trace(files.SIZE10, 2, 10)
    (tstamps, Trace, previous_stamps, Count_zh, Count_oz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_oz, hyper2id, obj2id) = state

    assert_equal(len(hyper2id), 3)
    assert_equal(len(obj2id), 6)

    # Ten events, four columns each
    for axis, expected in enumerate((10, 4)):
        assert_equal(Trace.shape[axis], expected)

    # Each of the two topics must own at least one stamp
    for z in (0, 1):
        assert previous_stamps._size(z) > 0

    for hyper_id, expected in enumerate((4, 4, 2)):
        assert_equal(count_h[hyper_id], expected)

    assert_equal(count_z.sum(), 20)  # depends on memory

    # We can only test shapes and sums, since assignments are random
    assert_equal(Count_zh.shape, (2, 3))
    assert_equal(Count_oz.shape, (6, 2))

    assert_equal(Count_zh.sum(), 10)
    assert_equal(Count_oz.sum(), 20)

    assert (prob_topics_aux == 0).all()

    # Simple sanity check on topic assignments: topics must have valid ids
    # and their frequency in the trace must be half of the count_z entry
    # (count_z sums to 20 for the 10 events)
    from collections import Counter
    topic_freq = Counter(Trace[:, -1])
    for topic, freq in topic_freq.items():
        assert topic in [0, 1]
        assert freq == count_z[topic] / 2
Beispiel #16
0
def fit(trace_fpath, num_topics, alpha_zh, beta_zs, kernel, \
        residency_priors, num_iter, num_batches, mpi_mode, from_=0, to=np.inf):
    '''
    Learns the latent topics from a temporal hypergraph trace. Here we do an
    asynchronous learning of the topics similar to AD-LDA, as well as the
    dynamic topic expansion/pruning.

    Every batch runs one learning pass (on the MPI workers or in-process),
    then a split stage and a merge stage. Each stage re-estimates the
    per-topic quality, resizes the topic structures and rebuilds the kernel.
    The probability matrices are aggregated only on the last batch.

    Parameters
    ----------
    trace_fpath : str
        The path of the trace. Each line should be a
        (timestamp, hypernode, source, destination) record where the
        timestamp is a long (seconds or milliseconds from epoch).

    num_topics : int
        The number of latent spaces to learn

    alpha_zh : float
        The value of the alpha_zh hyperparameter

    beta_zs : float
        The value of the beta_zs (beta) hyperparameter

    kernel : Kernel object
        The kernel to use. A fresh instance of the same class is built
        after every split and merge.

    residency_priors : array of float
        The kernel hyper parameters

    num_iter : int
        The number of iterations to learn the model from

    num_batches : int
        Defines the number of batches of size num_iter. Must be >= 2.

    mpi_mode : bool
        When true each batch is dispatched to the MPI workers (ranks
        1..comm.size - 1); otherwise `_learn.em` is run in this process.

    from_, to : optional
        Forwarded unchanged to `dataio.initialize_trace` and
        `prepare_results` (presumably the bounds of the part of the trace
        to use -- confirm against `dataio`).

    Returns
    -------

    A dictionary with the results.
    '''
    assert num_batches >= 2
    comm = MPI.COMM_WORLD
    num_workers = comm.size - 1

    Dts, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, prob_topics_aux, Theta_zh, Psi_sz, \
            hyper2id, source2id = \
            dataio.initialize_trace(trace_fpath, num_topics, num_iter, \
            from_, to)

    if mpi_mode:
        workloads = generate_workload(Count_zh.shape[1], num_workers, Trace)
    all_idx = np.arange(Trace.shape[0], dtype='i4')

    for batch in xrange(num_batches):
        print('Now at batch', batch)
        if mpi_mode:
            for worker_id in xrange(1, num_workers + 1):
                comm.send(num_iter, dest=worker_id, tag=Msg.LEARN.value)

            dispatch_jobs(Dts, Trace, Count_zh, Count_sz, \
                    count_h, count_z, alpha_zh, beta_zs, kernel, \
                    residency_priors, workloads, num_workers, comm)
            manage(comm, num_workers)
            fetch_results(comm, num_workers, workloads, Dts, Trace, \
                    previous_stamps, Count_zh, Count_sz, count_h, \
                    count_z, alpha_zh, beta_zs, Theta_zh, Psi_sz, \
                    kernel)
        else:
            # The number of topics may have changed in the previous batch
            prob_topics_aux = np.zeros(Count_zh.shape[0], dtype='f8')
            # NOTE(review): the burn-in argument (num_iter * 2) is larger
            # than num_iter -- presumably so em() skips its own aggregation;
            # confirm against _learn.em
            _learn.em(Dts, Trace, previous_stamps, Count_zh, Count_sz, \
                    count_h, count_z, alpha_zh, beta_zs, \
                    prob_topics_aux, Theta_zh, Psi_sz, num_iter, \
                    num_iter * 2, kernel, False)

        # Split first, then merge; both stages follow the same steps
        for stage_name, resize in (('Split', split), ('Merge', merge)):
            print(stage_name)
            ll_per_z = np.zeros(count_z.shape[0], dtype='f8')
            _eval.quality_estimate(Dts, Trace, previous_stamps, \
                    Count_zh, Count_sz, count_h, count_z, alpha_zh, \
                    beta_zs, ll_per_z, all_idx, kernel)
            Trace, Count_zh, Count_sz, count_z, previous_stamps, \
                    P = resize(Dts, Trace, previous_stamps, Count_zh, \
                    Count_sz, count_h, count_z, alpha_zh, beta_zs, \
                    ll_per_z, kernel)
            # Rebuild the kernel for the new number of topics
            kernel = kernel.__class__()
            kernel.build(Trace.shape[0], Count_zh.shape[0], residency_priors)
            if residency_priors.shape[0] > 0:
                kernel.update_state(P)

        # Re-allocated on every batch so the shapes follow the resized
        # count matrices; they are also passed to the next learning pass
        Theta_zh = np.zeros(shape=Count_zh.shape, dtype='f8')
        Psi_sz = np.zeros(shape=Count_sz.shape, dtype='f8')
        if batch == num_batches - 1:
            print('Computing probs')
            _learn._aggregate(Count_zh, Count_sz, count_h, count_z, \
                    alpha_zh, beta_zs, Theta_zh, Psi_sz)
        print('New nz', Count_zh.shape[0])
    if mpi_mode:
        for worker_id in xrange(1, num_workers + 1):
            comm.send(num_iter, dest=worker_id, tag=Msg.STOP.value)

    rv = prepare_results(trace_fpath, num_topics, alpha_zh, beta_zs, \
            kernel, residency_priors, num_iter, -1, Dts, Trace, \
            Count_zh, Count_sz, count_h, count_z, prob_topics_aux, Theta_zh, \
            Psi_sz, hyper2id, source2id, from_, to)

    rv['num_workers'] = np.asarray([num_workers])
    rv['num_batches'] = np.asarray([num_batches])
    rv['algorithm'] = np.asarray(['parallel dynamic'])
    return rv
Beispiel #17
0
def test_initialize_limits():
    '''
    Checks that initialising the SIZE10 trace with the extra limit arguments
    2 and 5 yields exactly three timestamps.
    '''
    state = dataio.initialize_trace(files.SIZE10, 2, 10, 2, 5)
    (tstamps, Trace, previous_stamps, Count_zh, Count_oz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_oz, hyper2id, obj2id) = state

    num_stamps = len(tstamps)
    assert num_stamps == 3
Beispiel #18
0
def fit(trace_fpath, num_topics, alpha_zh, beta_zs, kernel, residency_priors,
        num_iter, from_=0, to=np.inf):
    '''
    Learns the latent topics from a temporal hypergraph trace. Here we do an
    asynchronous learning of the topics similar to AD-LDA. The original notes
    state that an even number of threads is required.

    Parameters
    ----------
    trace_fpath : str
        The path of the trace. Each line should be a
        (timestamp, hypernode, source, destination) record where the
        timestamp is a long (seconds or milliseconds from epoch).

    num_topics : int
        The number of latent spaces to learn

    alpha_zh : float
        The value of the alpha_zh hyperparameter

    beta_zs : float
        The value of the beta_zs (beta) hyperparameter

    kernel : Kernel object
        The kernel to use

    residency_priors : array of float
        The kernel hyper parameters

    num_iter : int
        The number of iterations to learn the model from

    from_, to : optional
        Forwarded unchanged to `dataio.initialize_trace` and
        `prepare_results` (presumably the bounds of the part of the trace
        to use -- confirm against `dataio`).

    Returns
    -------

    A dictionary with the results.
    '''
    comm = MPI.COMM_WORLD
    # Every rank except this one is a worker
    num_workers = comm.size - 1

    state = dataio.initialize_trace(trace_fpath, num_topics, num_iter,
                                    from_, to)
    (Dts, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id) = state

    # Tell each worker to start learning for num_iter iterations
    for rank in xrange(1, num_workers + 1):
        comm.send(num_iter, dest=rank, tag=Msg.LEARN.value)

    workloads = generate_workload(Count_zh.shape[1], num_workers, Trace)
    dispatch_jobs(Dts, Trace, Count_zh, Count_sz, count_h, count_z,
                  alpha_zh, beta_zs, kernel, residency_priors, workloads,
                  num_workers, comm)
    manage(comm, num_workers)
    fetch_results(comm, num_workers, workloads, Dts, Trace, previous_stamps,
                  Count_zh, Count_sz, count_h, count_z, alpha_zh, beta_zs,
                  Theta_zh, Psi_sz, kernel)

    # Shut the workers down
    for rank in xrange(1, num_workers + 1):
        comm.send(rank, dest=rank, tag=Msg.STOP.value)

    results = prepare_results(trace_fpath, num_topics, alpha_zh, beta_zs,
                              kernel, residency_priors, num_iter, -1, Dts,
                              Trace, Count_zh, Count_sz, count_h, count_z,
                              prob_topics_aux, Theta_zh, Psi_sz, hyper2id,
                              source2id, from_, to)

    results['num_workers'] = np.asarray([num_workers])
    results['algorithm'] = np.asarray(['parallel gibbs + em'])
    return results
Beispiel #19
0
def fit(trace_fpath, num_topics, alpha_zh, beta_zs, kernel, residency_priors,
        num_iter, from_=0, to=np.inf):
    '''
    Learns the latent topics from a temporal hypergraph trace. Here we do an
    asynchronous learning of the topics similar to AD-LDA. The original notes
    state that an even number of threads is required.

    Parameters
    ----------
    trace_fpath : str
        The path of the trace. Each line should be a
        (timestamp, hypernode, source, destination) record where the
        timestamp is a long (seconds or milliseconds from epoch).

    num_topics : int
        The number of latent spaces to learn

    alpha_zh : float
        The value of the alpha_zh hyperparameter

    beta_zs : float
        The value of the beta_zs (beta) hyperparameter

    kernel : Kernel object
        The kernel to use

    residency_priors : array of float
        The kernel hyper parameters

    num_iter : int
        The number of iterations to learn the model from

    from_, to : optional
        Forwarded unchanged to `dataio.initialize_trace` and
        `prepare_results` (presumably the bounds of the part of the trace
        to use -- confirm against `dataio`).

    Returns
    -------

    A dictionary with the results.
    '''
    comm = MPI.COMM_WORLD
    # Every rank except this one is a worker
    num_workers = comm.size - 1

    state = dataio.initialize_trace(trace_fpath, num_topics, num_iter,
                                    from_, to)
    (Dts, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id) = state

    # Tell each worker to start learning for num_iter iterations
    for rank in xrange(1, num_workers + 1):
        comm.send(num_iter, dest=rank, tag=Msg.LEARN.value)

    workloads = generate_workload(Count_zh.shape[1], num_workers, Trace)
    dispatch_jobs(Dts, Trace, Count_zh, Count_sz, count_h, count_z,
                  alpha_zh, beta_zs, kernel, residency_priors, workloads,
                  num_workers, comm)
    manage(comm, num_workers)
    fetch_results(comm, num_workers, workloads, Dts, Trace, previous_stamps,
                  Count_zh, Count_sz, count_h, count_z, alpha_zh, beta_zs,
                  Theta_zh, Psi_sz, kernel)

    # Shut the workers down
    for rank in xrange(1, num_workers + 1):
        comm.send(rank, dest=rank, tag=Msg.STOP.value)

    results = prepare_results(trace_fpath, num_topics, alpha_zh, beta_zs,
                              kernel, residency_priors, num_iter, -1, Dts,
                              Trace, Count_zh, Count_sz, count_h, count_z,
                              prob_topics_aux, Theta_zh, Psi_sz, hyper2id,
                              source2id, from_, to)

    results['num_workers'] = np.asarray([num_workers])
    results['algorithm'] = np.asarray(['parallel gibbs + em'])
    return results
Beispiel #20
0
def test_initialize_limits():
    '''
    Checks that initialising the SIZE10 trace with the extra limit arguments
    2 and 5 yields exactly three timestamps.
    '''
    state = dataio.initialize_trace(files.SIZE10, 2, 10, 2, 5)
    (tstamps, Trace, previous_stamps, Count_zh, Count_oz, count_h, count_z,
     prob_topics_aux, Theta_zh, Psi_oz, hyper2id, obj2id) = state

    num_stamps = len(tstamps)
    assert num_stamps == 3