Example No. 1
def finalize_splits(nz, n_splits, splitted, Dts, Trace, nh, ns, kernel):
    
    new_nz = nz + n_splits
    if kernel.get_priors().shape[0] > 0:
        new_P = [row for row in kernel.get_state()]
        for _ in xrange(n_splits):
            new_P.append(kernel.get_priors())
    else:
        new_P = kernel.get_state()

    Trace[:, -1] = splitted
    
    #Populate new counts
    Count_zh_new = np.zeros(shape=(new_nz, nh), dtype='i4')
    Count_sz_new = np.zeros(shape=(ns, new_nz), dtype='i4')
    count_z_new = np.zeros(new_nz, dtype='i4')
    count_h_new = np.zeros(nh, dtype='i4')
    
    _learn.fast_populate(Trace, Count_zh_new, Count_sz_new, \
            count_h_new, count_z_new)
    
    new_stamps = StampLists(new_nz)
    for z in xrange(new_nz):
        idx = Trace[:, -1] == z
        topic_stamps = Dts[idx]
        new_stamps._extend(z, topic_stamps[:, -1])

    return Trace, Count_zh_new, Count_sz_new, \
            count_z_new, new_stamps, np.array(new_P)
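For reference, a minimal NumPy sketch of what _learn.fast_populate is assumed to compute from the trace (a hypothetical pure-Python stand-in; the real routine is compiled). It follows the row layout used throughout these examples: hypernode id first, then the objects, then the topic assignment.

import numpy as np

def fast_populate_sketch(Trace, Count_zh, Count_sz, count_h, count_z):
    #Assumed equivalent of _learn.fast_populate: rebuild all count matrices
    #from Trace rows of the form [h, o_1, ..., o_k, z].
    for line in Trace:
        h, z = line[0], line[-1]
        Count_zh[z, h] += 1
        count_h[h] += 1
        for o in line[1:-1]:
            Count_sz[o, z] += 1
            count_z[z] += 1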
Example No. 2
def test_all():
    
    slists = StampLists(4)
    E = []
    for i in xrange(4):
        E.append([])
        for _ in xrange(200):
            e = np.random.rand()
            slists._append(i, e)
            E[i].append(e)

    kern = ECCDFKernel(True)
    kern.build(800, 4, np.array([1.0, 3.0]))

    kern._mstep(slists)

    for i in xrange(4):
        assert_array_equal(sorted(E[i]), slists._get_all(i))


    for i in xrange(4):
        for _ in xrange(200):
            p = kern._pdf(i, np.random.rand(), slists)
            assert p <= 1
            assert p >= 0
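The tests above only rely on a small surface of StampLists; below is a hedged pure-Python stand-in for the assumed interface (the real class is compiled and, judging by the assertion on sorted(E[i]), keeps each topic's stamps sorted after _mstep).

class StampListsSketch(object):
    #Hypothetical stand-in: one list of timestamps per latent topic/environment.
    def __init__(self, num_topics):
        self.stamps = [[] for _ in xrange(num_topics)]

    def _append(self, topic, stamp):
        self.stamps[topic].append(stamp)

    def _extend(self, topic, new_stamps):
        self.stamps[topic].extend(new_stamps)

    def _get_all(self, topic):
        return self.stamps[topic]

    def _clear_one(self, topic):
        self.stamps[topic] = []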
Example No. 3
def test_pdf():
    for mu in [0, 1, 10]:
        for v in [1, 10]:
            for std in [1, 10]:
                priors = np.array([mu, v, std], dtype='d')
                kernel = TStudentKernel()
                kernel.build(999, 1, priors)  #99... is just the max freed

                truth = t(v, loc=mu, scale=std)
                for x in np.linspace(-100, 100, 200):
                    print(mu, v, std, x, truth.pdf(x),
                          kernel._pdf(x, 0, StampLists(1)))
                    assert_almost_equal(truth.pdf(x), \
                            kernel._pdf(x, 0, StampLists(1)))
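The reference distribution here is SciPy's location-scale Student-t; a quick standalone check of the density formula the test relies on (illustrative values only, not TribeFlow code):

import numpy as np
from scipy.stats import t

mu, v, std = 1.0, 10.0, 2.0
x = 0.3
#Location-scale form: pdf(x; mu, v, std) = t.pdf((x - mu) / std, v) / std
manual = t.pdf((x - mu) / std, v) / std
assert np.allclose(manual, t(v, loc=mu, scale=std).pdf(x))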
Example No. 4
def main(model, out_fpath_rrs, out_fpath_pred):
    store = pd.HDFStore(model)
    trace_fpath = store['trace_fpath'][0][0]
    tsFile = open(
        '/home/zahran/Desktop/tribeFlow/zahranData/lastfm-dataset-1K/PARSED_74123_B10_zahran_sampledData',
        'r')
    sequenceLength = 10

    from_ = store['from_'][0][0]
    to = store['to'][0][0]
    assert from_ == 0

    kernel_class = store['kernel_class'][0][0]
    kernel_class = eval(kernel_class)

    Theta_zh = store['Theta_zh'].values
    Psi_sz = store['Psi_sz'].values
    count_z = store['count_z'].values[:, 0]
    P = store['P'].values
    residency_priors = store['residency_priors'].values[:, 0]
    previous_stamps = StampLists(count_z.shape[0])  #previous_stamps has length nz

    true_mem_size = store['Dts'].values.shape[1]
    tstamps = store['Dts'].values[:, 0]  #tstamps (#trainingLines,): the t(xi) - t(xi-1) values
    assign = store['assign'].values[:, 0]  #assign (#trainingLines,): the env id used in each training instance
    for z in xrange(count_z.shape[0]):
        idx = assign == z
        previous_stamps._extend(z, tstamps[idx])  #the tstamps whose index in idx is True

    hyper2id = dict(store['hyper2id'].values)
    obj2id = dict(store['source2id'].values)
    trace_size = sum(count_z)  #sum of the number of appearances of all envs
    kernel = kernel_class()
    kernel.build(trace_size, count_z.shape[0], residency_priors)
    kernel.update_state(P)

    with open(trace_fpath) as traceFile:
        predictObject(store, sequenceLength, tsFile, traceFile, true_mem_size,
                      hyper2id, obj2id, previous_stamps, Theta_zh, Psi_sz,
                      count_z, kernel)
    tsFile.close()
    traceFile.close()
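The model argument above is assumed to be a pandas HDFStore written at training time; a quick way to inspect which keys a given model file actually contains (file name hypothetical):

import pandas as pd

store = pd.HDFStore('trained_model.h5')  #hypothetical path
print(store.keys())  #the examples here read /trace_fpath, /from_, /to, /kernel_class,
                     #/Theta_zh, /Psi_sz, /count_z, /P, /residency_priors,
                     #/Dts, /assign, /hyper2id and /source2id
print(store['count_z'].values[:, 0].shape)  #one count per latent environment
store.close()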
Example No. 5
def finalize_merge(nz, to_merge, Dts, Trace, nh, ns, kernel):
    
    for z1, z2 in to_merge:
        idx = Trace[:, -1] == z2
        Trace[:, -1][idx] = z1
    
    if to_merge and kernel.get_priors().shape[0] > 0:
        new_P_dict = dict((i, row) for i, row in enumerate(kernel.get_state()))
        for z1, z2 in to_merge:
            del new_P_dict[z2]

        new_P = []
        for i in sorted(new_P_dict):
            new_P.append(new_P_dict[i])
    else:
        new_P = kernel.get_state()

    #Make sure new trace has contiguous ids
    new_assign = Trace[:, -1].copy()
    old_assign = Trace[:, -1].copy()
    if to_merge:
        new_nz = len(set(new_assign))
        for i, z in enumerate(set(new_assign)):
            idx = old_assign == z
            new_assign[idx] = i
    else:
        new_nz = nz
    Trace[:, -1] = new_assign

    #Populate new counts
    Count_zh_new = np.zeros(shape=(new_nz, nh), dtype='i4')
    Count_sz_new = np.zeros(shape=(ns, new_nz), dtype='i4')
    count_z_new = np.zeros(new_nz, dtype='i4')
    count_h_new = np.zeros(nh, dtype='i4')

    _learn.fast_populate(Trace, Count_zh_new, Count_sz_new, \
            count_h_new, count_z_new)
    
    new_stamps = StampLists(new_nz)
    for z in xrange(new_nz):
        idx = Trace[:, -1] == z
        topic_stamps = Dts[idx]
        new_stamps._extend(z, topic_stamps[:, -1])

    return Trace, Count_zh_new, Count_sz_new, \
            count_z_new, new_stamps, np.array(new_P)
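The contiguous re-labelling step above can also be written with np.unique; a hedged alternative sketch (not the code used here, and unlike the set-based loop it orders the new ids by old id):

import numpy as np

assign = np.array([0, 3, 3, 7, 0, 7])  #topic ids with gaps left by a merge
old_ids, new_assign = np.unique(assign, return_inverse=True)
#old_ids -> [0, 3, 7]; new_assign -> [0, 1, 1, 2, 0, 2], i.e. contiguous ids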
Example No. 6
def sample(Dts, Trace, Count_zh, Count_sz_local, \
        count_h, count_z, alpha_zh, beta_zs, kernel, num_iter, comm):
    
    previous_encounters_s = {}
    for other_processor in xrange(1, comm.size):
        previous_encounters_s[other_processor] = np.zeros_like(Count_sz_local)

    stamps = StampLists(Count_zh.shape[0])
    for z in xrange(Count_zh.shape[0]):
        idx = Trace[:, -1] == z
        #dts_assigned = Dts[idx:, 0].ravel().copy()
        #np.sort(dts_assigned)
        stamps._extend(z, Dts[idx][:, -1])
    
    aux = np.zeros(Count_zh.shape[0], dtype='f8')
 
    Count_sz_pair = np.zeros_like(Count_sz_local)
    Count_sz_others = np.zeros_like(Count_sz_local)
    Count_sz_sum = np.zeros_like(Count_sz_local)

    Theta_zh = np.zeros_like(Count_zh, dtype='f8')
    Psi_sz = np.zeros_like(Count_sz_local, dtype='f8')
    
    can_pair = True
    for i in xrange(num_iter // CACHE_SIZE):
        #Sample from the local counts and encountered counts
        Count_sz_sum[:] = Count_sz_local + Count_sz_others
        count_z[:] = Count_sz_sum.sum(axis=0)
        
        em(Dts, Trace, stamps, Count_zh, Count_sz_sum, \
                count_h, count_z, alpha_zh, beta_zs, aux, Theta_zh, \
                Psi_sz, CACHE_SIZE, CACHE_SIZE * 2, kernel, False)

        #Update local counts
        Count_sz_local[:] = Count_sz_sum - Count_sz_others
        count_z[:] = Count_sz_local.sum(axis=0)

        #Update expected belief of other processors
        if can_pair:
            P_local = kernel.get_state()
            can_pair = paired_update(comm, previous_encounters_s, \
                    Count_sz_local, Count_sz_pair, Count_sz_others, \
                    P_local, np.zeros_like(P_local))
            kernel.update_state(P_local)
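The bookkeeping around em keeps the local counts consistent across iterations: the sampler sees local plus received counts, and the received portion is subtracted back out afterwards. A tiny numeric sketch of that invariant (made-up values):

import numpy as np

Count_sz_local = np.array([[2, 1], [0, 3]], dtype='i4')
Count_sz_others = np.array([[1, 0], [1, 1]], dtype='i4')

Count_sz_sum = Count_sz_local + Count_sz_others  #what em resamples
#...em may move mass between the columns of Count_sz_sum...
Count_sz_local = Count_sz_sum - Count_sz_others  #recover the local share
count_z = Count_sz_local.sum(axis=0)             #per-topic totals, as in the loop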
Example No. 8
def split(Dts, Trace, previous_stamps, Count_zh, Count_sz, \
        count_h, count_z, alpha_zh, beta_zs, ll_per_z, kernel, \
        perc=0.05, min_stamps=50):
    
    nz = Count_zh.shape[0]
    nh = Count_zh.shape[1]
    ns = Count_sz.shape[0]
    
    assert nz == ll_per_z.shape[0]
    idx_int_all = np.arange(Trace.shape[0], dtype='i4')
    
    #Initiate auxiliary matrices
    Count_zh_spl = np.zeros(shape=(nz + 1, nh), dtype='i4')
    Count_sz_spl = np.zeros(shape=(ns, nz + 1), dtype='i4')
    count_z_spl = np.zeros(nz + 1, dtype='i4')

    Count_zh_spl[:-1, :] = Count_zh
    Count_sz_spl[:, :-1] = Count_sz
    count_z_spl[:-1] = count_z

    ll_per_z_new = np.zeros(nz + 1, dtype='f8')
    ll_per_z_new[:-1] = ll_per_z

    new_stamps = StampLists(nz + 1) 
    for z in xrange(nz):
        new_stamps._extend(z, previous_stamps._get_all(z))
    
    splitted = Trace[:, -1].copy()
    shift = 0

    #Do the splits per topic
    for z in xrange(nz):
        #Candidates for removal
        topic_stamps = np.asanyarray(previous_stamps._get_all(z))
        idx = Trace[:, -1] == z
        assert topic_stamps.shape[0] == idx.sum()
        
        argsrt = topic_stamps.argsort()
        top = int(np.ceil(perc * topic_stamps.shape[0]))
        
        #If there are not at least min_stamps candidates, skip: not enough for a CCDF estimation
        if top < min_stamps:
            continue
        
        #Populate stamps
        new_stamps._clear_one(z)
        new_stamps._clear_one(nz)
        new_stamps._extend(z, topic_stamps[:-top])
        new_stamps._extend(nz, topic_stamps[-top:])
        
        #Split topic on the Trace. The trace has to be sorted by timestamp!!
        old_assign = Trace[:, -1][idx].copy()
        new_assign = Trace[:, -1][idx].copy()
        new_assign[-top:] = nz
        Trace[:, -1][idx] = new_assign
        
        #Update matrices. Can't really vectorize this :(
        for line in Trace[idx][-top:]:
            h = line[0]

            Count_zh_spl[z, h] -= 1
            for o in line[1:-1]:
                Count_sz_spl[o, z] -= 1
                count_z_spl[z] -= 1
            
            Count_zh_spl[nz, h] += 1
            for o in line[1:-1]:
                Count_sz_spl[o, nz] += 1
                count_z_spl[nz] += 1

        #New LL
        ll_per_z_new[z] = 0
        ll_per_z_new[-1] = 0
        
        idx_int = idx_int_all[idx]
        _eval.quality_estimate(Dts, Trace, \
                new_stamps, Count_zh_spl, Count_sz_spl, count_h, \
                count_z_spl, alpha_zh, beta_zs, \
                ll_per_z_new, idx_int, kernel)
        
        if ll_per_z_new.sum() > ll_per_z.sum():
            new_assign[-top:] = nz + shift
            splitted[idx] = new_assign
            shift += 1

        #Revert trace
        new_stamps._clear_one(z)
        new_stamps._clear_one(nz)
        new_stamps._extend(z, previous_stamps._get_all(z))

        Count_zh_spl[:-1, :] = Count_zh
        Count_sz_spl[:, :-1] = Count_sz
        count_z_spl[:-1] = count_z

        Count_zh_spl[-1, :] = 0
        Count_sz_spl[:, -1] = 0
        count_z_spl[-1] = 0
        
        ll_per_z_new[z] = ll_per_z[z]
        ll_per_z_new[-1] = 0
        Trace[:, -1][idx] = old_assign
    
    return finalize_splits(nz, shift, splitted, Dts, Trace, nh, ns, kernel)
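The candidate for each split is simply the largest perc fraction of a topic's inter-event times, and only when there are at least min_stamps of them. A small standalone sketch of that selection (random stand-in data; remember the trace itself must already be sorted by timestamp):

import numpy as np

perc, min_stamps = 0.05, 50
topic_stamps = np.sort(np.random.rand(2000))      #stand-in for one topic's stamps
top = int(np.ceil(perc * topic_stamps.shape[0]))  #size of the split candidate
if top >= min_stamps:
    keep, candidate = topic_stamps[:-top], topic_stamps[-top:]
    #keep stays in topic z, candidate seeds the provisional topic nz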
Example No. 9
def main(model, out_fpath_rrs, out_fpath_pred):
    store = pd.HDFStore(model)
    
    from_ = store['from_'][0][0]
    to = store['to'][0][0]
    assert from_ == 0
    
    trace_fpath = store['trace_fpath'][0][0]
    kernel_class = store['kernel_class'][0][0]
    kernel_class = eval(kernel_class)

    Theta_zh = store['Theta_zh'].values
    Psi_sz = store['Psi_sz'].values
    count_z = store['count_z'].values[:, 0]
    P = store['P'].values
    residency_priors = store['residency_priors'].values[:, 0]
    
    previous_stamps = StampLists(count_z.shape[0])

    mem_size = store['Dts'].values.shape[1]
    tstamps = store['Dts'].values[:, 0]
    assign = store['assign'].values[:, 0]
    for z in xrange(count_z.shape[0]):
        idx = assign == z
        previous_stamps._extend(z, tstamps[idx])

    hyper2id = dict(store['hyper2id'].values)
    obj2id = dict(store['source2id'].values)
    
    HSDs = []
    Dts = []

    with open(trace_fpath) as trace_file:
        for i, l in enumerate(trace_file): 
            if i < to:
                continue
            
            spl = l.strip().split('\t')
            dts_line = [float(x) for x in spl[:mem_size]]
            h = spl[mem_size]
            d = spl[-1]
            sources = spl[mem_size + 1:-1]
            
            all_in = h in hyper2id and d in obj2id
            for s in sources:
                all_in = all_in and s in obj2id
            
            if all_in:
                trace_line = [hyper2id[h]] + [obj2id[s] for s in sources] + \
                        [obj2id[d]]
                HSDs.append(trace_line)
                Dts.append(dts_line)
    
    trace_size = sum(count_z)
    kernel = kernel_class()
    kernel.build(trace_size, count_z.shape[0], residency_priors)
    kernel.update_state(P)
    
    num_queries = min(10000, len(HSDs))
    queries = np.random.choice(len(HSDs), size=num_queries)

    HSDs = np.array(HSDs, dtype='i4')[queries].copy()
    Dts = np.array(Dts, dtype='d')[queries].copy()
    rrs, preds = _eval.reciprocal_rank(Dts, \
            HSDs, previous_stamps, Theta_zh, Psi_sz, count_z, kernel, True)
    
    np.savetxt(out_fpath_rrs, rrs)
    np.savetxt(out_fpath_pred, preds)
    print((1.0 / rrs).mean(axis=0))
    store.close()
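Assuming each row of the saved rrs file holds the reciprocal rank of one query, the usual summaries can be recovered later with np.loadtxt (output path hypothetical):

import numpy as np

rrs = np.loadtxt('rrs.dat')      #file written by np.savetxt(out_fpath_rrs, rrs)
print(rrs.mean(axis=0))          #mean reciprocal rank
print((1.0 / rrs).mean(axis=0))  #mean rank, as printed above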
Example No. 10
def initialize_trace(trace_fpath, num_topics, num_iter, \
        from_=0, to=np.inf, initial_assign=None):
    '''
    Given a trace (user trajectories) to learn from, this method will 
    initialize the necessary matrices and dicts to learn tribeflow.
    
    Using from_ and to, the trace can be sliced. initial_assign is useful to
    pick up learning from a previous model.

    Parameters
    ----------
    trace_fpath : string
        The location of the trace

    num_topics : int
        The number of latent spaces

    from_ : int
        Where to begin reading the trace from. 0 is the first line.

    to : int
        We will stop reading the file here

    initial_assign : array-like
        Initial topic assignments.

    Returns
    -------

    Count matrices and dicts used to learn tribeflow.
    '''
    count_zh_dict = defaultdict(int)
    count_oz_dict = defaultdict(int)
    count_z_dict = defaultdict(int)
    count_h_dict = defaultdict(int)

    hyper2id = OrderedDict()
    obj2id = OrderedDict()

    if initial_assign is not None:  #explicit None check; truth-testing a NumPy array raises
        initial_assign = np.asarray(initial_assign, dtype='i')
        assert initial_assign.min() >= 0
        assert initial_assign.max() < num_topics

    Dts = []
    Trace = []
    with open(trace_fpath, 'r') as trace_file:
        for i, line in enumerate(trace_file):
            if i < from_:
                continue

            if i >= to:
                break

            spl = line.strip().split('\t')
            assert len(spl) >= 4
            assert (len(spl) - 2) % 2 == 0
            mem_size = (len(spl) - 2) // 2

            line_dts = []
            for j in xrange(mem_size):
                line_dts.append(float(spl[j]))
            Dts.append(line_dts)

            hyper_str = spl[mem_size]
            if hyper_str not in hyper2id:
                hyper2id[hyper_str] = len(hyper2id)

            if initial_assign is None:
                z = np.random.randint(num_topics)
            else:
                z = initial_assign[i]

            h = hyper2id[hyper_str]
            count_zh_dict[z, h] += 1
            count_h_dict[h] += 1

            line_int = [h]
            for j in xrange(mem_size + 1, len(spl)):
                obj_str = spl[j]

                if obj_str not in obj2id:
                    obj2id[obj_str] = len(obj2id)

                o = obj2id[obj_str]
                line_int.append(o)

                count_oz_dict[o, z] += 1
                count_z_dict[z] += 1

            line_int.append(z)
            Trace.append(line_int)

    #Sort by the last residency time.
    Dts = np.asarray(Dts)
    argsort = Dts[:, -1].argsort()
    assert Dts.shape[1] == mem_size

    #Create contiguous arrays, not needed but adds a small speedup
    Dts = np.asanyarray(Dts[argsort], order='C')
    Trace = np.asarray(Trace)
    Trace = np.asanyarray(Trace[argsort], dtype='i4', order='C')

    nh = len(hyper2id)
    no = len(obj2id)
    nz = num_topics

    previous_stamps = StampLists(num_topics)
    for z in xrange(nz):
        idx = Trace[:, -1] == z
        topic_stamps = Dts[:, -1][idx]
        previous_stamps._extend(z, topic_stamps)

    Count_zh = np.zeros(shape=(nz, nh), dtype='i4')
    Count_oz = np.zeros(shape=(no, nz), dtype='i4')
    count_h = np.zeros(shape=(nh, ), dtype='i4')
    count_z = np.zeros(shape=(nz, ), dtype='i4')

    for z in xrange(Count_zh.shape[0]):
        count_z[z] = count_z_dict[z]

        for h in xrange(Count_zh.shape[1]):
            count_h[h] = count_h_dict[h]
            Count_zh[z, h] = count_zh_dict[z, h]

        for o in xrange(Count_oz.shape[0]):
            Count_oz[o, z] = count_oz_dict[o, z]

    assert (Count_oz.sum(axis=0) == count_z).all()

    prob_topics_aux = np.zeros(nz, dtype='f8')
    Theta_zh = np.zeros(shape=(nz, nh), dtype='f8')
    Psi_oz = np.zeros(shape=(no, nz), dtype='f8')

    return Dts, Trace, previous_stamps, Count_zh, Count_oz, \
            count_h, count_z, prob_topics_aux, Theta_zh, Psi_oz, \
            hyper2id, obj2id
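A hedged usage sketch for initialize_trace, assuming the tab-separated format the parser above expects (mem_size residency times, one hypernode, then mem_size + 1 objects per line); the file and names below are made up, and initialize_trace is assumed importable from the module above.

#Toy trace with mem_size = 1: dt \t hypernode \t source \t destination
with open('toy_trace.tsv', 'w') as f:
    f.write('0.5\tuser_a\tsong_x\tsong_y\n')
    f.write('1.5\tuser_b\tsong_y\tsong_z\n')

out = initialize_trace('toy_trace.tsv', num_topics=2, num_iter=0)
Dts, Trace = out[0], out[1]
print(Trace.shape)  #(2, 4): [hypernode, source, destination, topic] per line
print(Dts.shape)    #(2, 1): one residency time per line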
Example No. 11
def main(model, out_fpath_rrs, out_fpath_pred):
    store = pd.HDFStore(model)

    from_ = store['from_'][0][0]
    to = store['to'][0][0]
    assert from_ == 0

    trace_fpath = store['trace_fpath'][0][0]
    kernel_class = store['kernel_class'][0][0]
    kernel_class = eval(kernel_class)

    Theta_zh = store['Theta_zh'].values
    Psi_sz = store['Psi_sz'].values
    count_z = store['count_z'].values[:, 0]
    P = store['P'].values
    residency_priors = store['residency_priors'].values[:, 0]

    previous_stamps = StampLists(count_z.shape[0])  #previous_stamps has length nz

    mem_size = store['Dts'].values.shape[1]
    tstamps = store['Dts'].values[:, 0]  #tstamps (#trainingLines,): the t(xi) - t(xi-1) values (a column of Dts)
    assign = store['assign'].values[:, 0]  #assign (#trainingLines,): the env id used in each training instance
    for z in xrange(count_z.shape[0]):
        idx = assign == z
        previous_stamps._extend(z, tstamps[idx])  #the tstamps whose index in idx is True

    hyper2id = dict(store['hyper2id'].values)
    obj2id = dict(store['source2id'].values)

    HSDs = []
    Dts = []

    with open(trace_fpath) as trace_file:
        for i, l in enumerate(trace_file):
            if i < to:
                continue  #skip the training lines to reach the test set

            spl = l.strip().split('\t')
            dts_line = [float(x) for x in spl[:mem_size]]
            h = spl[mem_size]
            d = spl[-1]
            sources = spl[mem_size + 1:-1]

            all_in = h in hyper2id and d in obj2id
            for s in sources:
                all_in = all_in and s in obj2id

            if all_in:
                trace_line = [hyper2id[h]] + [obj2id[s] for s in sources] + \
                        [obj2id[d]]
                HSDs.append(trace_line)
                Dts.append(dts_line)

    trace_size = sum(count_z)  #sum of the number of appearances of all envs
    kernel = kernel_class()
    kernel.build(trace_size, count_z.shape[0], residency_priors)
    kernel.update_state(P)

    num_queries = min(10000, len(HSDs))
    queries = np.random.choice(len(HSDs), size=num_queries)

    HSDs = np.array(HSDs, dtype='i4')[queries].copy()
    Dts = np.array(Dts, dtype='d')[queries].copy()
    rrs, preds = _eval.reciprocal_rank(Dts, HSDs, previous_stamps, Theta_zh,
                                       Psi_sz, count_z, kernel, True)

    np.savetxt(out_fpath_rrs, rrs)
    np.savetxt(out_fpath_pred, preds)
    print(rrs.mean(axis=0))
    store.close()