Example 1
        def recombination(s1, s2):
            """
                    Recombine genes of two solutions. If s2 is None, simply return s1

                    Args:
                        ----------
                        s1 (numpy.array) : 1d array representing a solution.
                        s2 (numpy.array) : 1d array representing a solution.
                    Returns:
                        ----------
                        tuple of np.array: recombined solutions.
            """
            # create children
            c1 = np.full_like(s1, fill_value=-1)
            c2 = np.full_like(s1, fill_value=-1)
            # positions where both are valid
            r = np.where(np.logical_and(s1 != -1, s2 != -1))[0]
            # positions where exactly one is valid
            q = np.where(np.logical_xor(s1 != -1, s2 != -1))[0]
            q_sol = s1[q] + s2[q] + 1  # the valid (non -1) value at each position in q
            # find best solution among r
            r_size = r.shape[0]
            best_coef, best_sol = np.inf, None
            if r_size:
                for sol in cartesian(np.vstack([s1[r], s2[r]]).T):
                    c1[r] = sol
                    coef = self.sparcity_coefficient(c1, r_size)
                    if coef < best_coef:
                        best_coef, best_sol = coef, sol
                c1[r] = best_sol
            # greedily find best solution among q
            dimensions_inserted = r_size
            taken_q = np.zeros_like(q)
            while dimensions_inserted != self.dimensionality:
                dimensions_inserted += 1

                best_coef, best_index, current_best_index = np.inf, None, None
                for j, (taken, q_index) in enumerate(zip(taken_q, q)):
                    if not taken:
                        c1[q_index] = q_sol[j]
                        # print(c1, dimensions_inserted)
                        coef = self.sparcity_coefficient(
                            c1, dimensions_inserted)
                        if coef < best_coef:
                            best_coef, best_index, current_best_index = coef, q_index, j
                        c1[q_index] = -1

                c1[best_index] = q_sol[current_best_index]
                taken_q[current_best_index] = 1

            # make c2 complementary to c1
            crossover_positions = np.concatenate([r, q])
            c2[crossover_positions] = s1[crossover_positions] + s2[
                crossover_positions] - c1[crossover_positions]
            return c1, c2
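The loop over cartesian(np.vstack([s1[r], s2[r]]).T) above enumerates every way of picking one parent's gene per shared position. The helper itself is not part of the snippet; a minimal sketch of a row-wise Cartesian-product function with that calling convention (an assumption, not the project's actual implementation) is:

import numpy as np
from itertools import product

def cartesian(arrays):
    # one row of `arrays` per position, each row listing the candidate values there
    return np.array(list(product(*arrays)))

# two shared positions with parent values (0, 5) and (3, 7) -> four candidate assignments
cartesian(np.array([[0, 5], [3, 7]]))
# array([[0, 3], [0, 7], [5, 3], [5, 7]])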
Example 2
def roller(thetas, rhos, radius):
    # Clipper works only with integers, scaling needed
    p = cartesian(thetas, rhos)
    # scale so the sagitta of one angular step at the outer radius maps to ~1 integer unit
    scale = 1 / (rhos.max() * (1 - np.cos(np.pi / thetas.shape[0])))
    p *= scale
    coords = p.astype(int)
    pco = pyclipper.PyclipperOffset()
    pco.AddPath(coords, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    result = pco.Execute(-radius * scale)[0]
    p = polar(*zip(*result))
    p[1] /= scale
    return p
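Here cartesian and polar act as coordinate converters: polar samples go in, x/y points come out for Clipper, and the offset result is converted back. Those helpers are not shown above; a minimal sketch under that assumption (not the project's actual utils) could be:

import numpy as np

def cartesian(thetas, rhos):
    # polar samples (angle, radius) -> (N, 2) array of x/y points
    return np.column_stack((rhos * np.cos(thetas), rhos * np.sin(thetas)))

def polar(xs, ys):
    # x/y samples -> 2-row array [thetas, rhos], matching the p[1] /= scale access above
    xs, ys = np.asarray(xs, dtype=float), np.asarray(ys, dtype=float)
    return np.vstack((np.arctan2(ys, xs), np.hypot(xs, ys)))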
Example 3
 def __getattr__(self, name):
     if name == 'theta':
         return self.pcoords[0]
     elif name == 'rho':
         return self.pcoords[1]
     elif name == 'ppoints':
         return self.pcoords.T
     elif name == 'points':
         if self._points is None:
             self._points = cartesian(*self.pcoords)
         return self._points
     elif name == 'coords':
         return self.points.T
     # unknown names should raise, matching normal attribute lookup semantics
     raise AttributeError(name)
Example 4
def init_neg_samples():
    with open(TEST_NEG_SAMPLES, 'rb') as fp:
        data = json.loads(fp.read())
        new_session_num = 0
        new_sample_num = 0

        # need to normalize trip to start at 0,0
        x0 = data['features'][0]['geometry']['coordinates'][1]
        y0 = data['features'][0]['geometry']['coordinates'][0]
        (x0, y0) = cartesian(x0, y0)[:2]
        for feature in data['features']:
            (x, y) = cartesian(feature['geometry']['coordinates'][1],
                               feature['geometry']['coordinates'][0])[:2]
            new_log = Logs(driverID=2,
                           sessionNum=new_session_num,
                           sampleNum=new_sample_num,
                           time=feature['properties']['time'],
                           timeLong=feature['properties']['time_long'],
                           xCoord=x - x0,
                           yCoord=y - y0)
            db.session.add(new_log)
            new_sample_num += 1
        db.session.commit()
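For reference, a sketch of the GeoJSON structure the loop above expects the TEST_NEG_SAMPLES file to contain; the keys come from the accesses in the code, while the values are purely illustrative:

sample_data = {
    "features": [
        {
            "geometry": {"coordinates": [-122.4194, 37.7749]},  # [lon, lat]
            "properties": {"time": "12:00:00", "time_long": 1546344000},
        },
        # ... one feature per logged sample
    ]
}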
Example 5
def stl(cam, filename, width, conj=False):
    if conj:
        points = cartesian(*cam.conj_pcoords)
    else:
        points = cam.points

    # 2D Delaunay triangulation of the profile; the lower (z=0) and upper (z=width) faces are built from it
    tri0 = points[Delaunay(points).simplices]
    tri1 = np.concatenate((tri0, np.zeros([tri0.shape[0], tri0.shape[1], 1])),
                          2)
    tri2 = np.concatenate(
        (tri0, np.ones([tri0.shape[0], tri0.shape[1], 1]) * width), 2)

    # Build triangles for side face
    vertices1 = np.concatenate((points, np.zeros([points.shape[0], 1])), 1)
    vertices2 = np.concatenate((points, np.ones([points.shape[0], 1]) * width),
                               1)
    tri3_1 = np.empty([points.shape[0], 3, 3])
    tri3_2 = np.empty_like(tri3_1)
    for i in range(0, vertices1.shape[0]):
        tri3_1[i] = [vertices1[i - 1], vertices1[i], vertices2[i]]
        tri3_2[i] = [vertices2[i - 1], vertices2[i], vertices1[i - 1]]

    # Concatenate and save
    tri = np.concatenate((tri1, tri2, tri3_1, tri3_2))
    data = np.zeros(tri.shape[0], dtype=mesh.Mesh.dtype)
    data['vectors'] = tri
    prism = mesh.Mesh(data)
    prism.save(filename)

    # Ugly plotting
    fig = plt.figure()
    ax = mplot3d.Axes3D(fig)
    ax.add_collection3d(mplot3d.art3d.Poly3DCollection(prism.vectors))

    # Auto scale
    scale = prism.points.flatten()
    ax.auto_scale_xyz(scale, scale, scale)

    plt.show()
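A sketch of how this could be invoked with conj=False, assuming cam is any object exposing an (N, 2) points array (a stand-in namespace here, not the project's real profile class):

import numpy as np
from types import SimpleNamespace

square = SimpleNamespace(points=np.array([[0.0, 0.0], [10.0, 0.0], [10.0, 10.0], [0.0, 10.0]]))
stl(square, "square_prism.stl", width=5.0)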
Example 6
def get_experiments(cfg):
    """Creates a list of experiments."""
    # TODO: sampling
    exps = utils.cartesian(cfg.optspace)
    if hasattr(cfg, 'constraints'):
        for c in cfg.constraints:
            exps_tmp = []
            for exp in exps:
                if c(exp):
                    exps_tmp.append(exp)
            exps = exps_tmp

    if hasattr(cfg, 'optpt_cmp'):
        if sys.version_info < (3, 2):
            exps.sort(cmp=cfg.optpt_cmp)
        else:
            # the `cmp` parameter of list.sort was removed in Python 3;
            # functools.cmp_to_key has been available since Python 2.7 and 3.2
            import functools
            exps.sort(key=functools.cmp_to_key(cfg.optpt_cmp))

    return exps
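A hypothetical configuration object for get_experiments; the attribute names (optspace, constraints, optpt_cmp) come from the code above, while the concrete option space and rules are made up for illustration:

from types import SimpleNamespace

cfg = SimpleNamespace(
    # one list of candidate values per tunable parameter, e.g. (threads, block_size)
    optspace=[[1, 2, 4], [64, 128]],
    # keep only points whose total work is at least 128
    constraints=[lambda exp: exp[0] * exp[1] >= 128],
    # order the surviving points by increasing product of the two parameters
    optpt_cmp=lambda a, b: a[0] * a[1] - b[0] * b[1],
)

With this config, get_experiments(cfg) would enumerate the 3 x 2 = 6 combinations, drop (1, 64), and return the remaining five sorted by that product.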
Example 7
 def rot_coords(self, theta0):
     return cartesian(self.theta + theta0, self.rho).T
Example 8
def post_prediction(tma_id):
    tma_exists = db.session.query(TMAs.tmaID).filter_by(tmaID=tma_id).scalar()
    if tma_exists is None:
        return jsonify({
            "status": "fail",
            "data": {
                "tma_id": "%s is not registered" % tma_id
            }
        }), 400

    driver_id = tmaID_to_driverID(tma_id)
    if driver_id is None:
        return jsonify({
            "status": "fail",
            "data": {
                "tma_id":
                "%s does not correspond to a driver according to dispatch" %
                tma_id
            }
        }), 404

    if 'log' not in request.files:
        return jsonify({
            "status": "fail",
            "data": {
                "log": "no log file attached"
            }
        }), 400
    else:
        file = request.files['log']
        data = json.loads(file.read())
        try:
            new_session_num = db.session.query(func.max(
                Logs.sessionNum)).filter_by(driverID=driver_id).scalar() + 1
        except TypeError:
            new_session_num = 0
        new_sample_num = 0
        start_time = data['features'][0]['properties']['time']

        # need to normalize trip to start at 0,0
        x0 = data['features'][0]['geometry']['coordinates'][1]
        y0 = data['features'][0]['geometry']['coordinates'][0]
        (x0, y0) = utils.cartesian(x0, y0)[:2]
        for feature in data['features']:
            (x, y) = utils.cartesian(feature['geometry']['coordinates'][1],
                                     feature['geometry']['coordinates'][0])[:2]
            new_log = Logs(driverID=driver_id,
                           sessionNum=new_session_num,
                           sampleNum=new_sample_num,
                           time=feature['properties']['time'],
                           timeLong=feature['properties']['time_long'],
                           xCoord=x - x0,
                           yCoord=y - y0)
            db.session.add(new_log)
            new_sample_num += 1
        db.session.commit()

        # temp prediction
        p = multiprocessing.Process(target=make_prediction_async,
                                    args=(
                                        driver_id,
                                        new_session_num,
                                        start_time,
                                    ))
        #p = multiprocessing.Process(target=make_false_prediction_async, args=(driver_id, new_session_num, start_time,))
        p.start()
        return jsonify({"status": "success", "data": None}), 200
Example 9
#     for j in range(0, x.shape[1]):
#         if a[i, j] == 1:
#             for t1 in range(0, 2):
#                 p = utils.conditional_probability(
#                     x, j, np.equal, 1, i, np.equal, t1)
#                 print 'P({:s}=1|{:s}={:d})={:3.2f}%'.format(att[j], att[i], t1, 100.0 * p)
#                 print 'P({:s}=0|{:s}={:d})={:3.2f}%'.format(att[j], att[i], t1, 100.0 * (1 - p))
#             print '\n'


c = []
for j in [2]:  # looking only at the parents of E
    for i in range(0, x.shape[1]):
        if a[i, j] == 1:
            c.append([0, 1])  # for each parent of E, add a [0, 1] vector to c

comb = utils.cartesian(c)

for i in range(0, len(comb)):
    ind = np.logical_and(
        np.equal(x[:, 0], comb[i, 0]), np.equal(x[:, 1], comb[i, 1]))
    ind = np.logical_and(ind, np.equal(x[:, 3], comb[i, 2]))
    ind = np.logical_and(ind, np.equal(x[:, 4], comb[i, 3]))
    xl = x[ind, :]
    p = utils.simple_probability(xl, 2, np.equal, 1)
    if p == -1:
        print 'P(C={:d},F={:d},M={:d},A={:d})=0%\n'.format(comb[i, 0], comb[i, 1], comb[i, 2], comb[i, 3])
    else:
        print 'P(E=1|C={:d},F={:d},M={:d},A={:d})={:3.2f}%'.format(comb[i, 0], comb[i, 1], comb[i, 2], comb[i, 3], 100.0 * p)
        print 'P(E=0|C={:d},F={:d},M={:d},A={:d})={:3.2f}%\n'.format(comb[i, 0], comb[i, 1], comb[i, 2], comb[i, 3], 100.0 * (1 - p))
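For intuition, c ends up holding one [0, 1] list per parent of E, so utils.cartesian(c) enumerates every binary assignment of the four parents; each row is then matched against columns 0, 1, 3 and 4 of x (the C, F, M and A variables in the printouts). Assuming the usual row-wise Cartesian-product behaviour, the equivalent with the standard library is:

import numpy as np
from itertools import product

comb = np.array(list(product(*[[0, 1]] * 4)))
print(comb.shape)  # (16, 4)
print(comb[:3])    # [[0 0 0 0], [0 0 0 1], [0 0 1 0]]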
Example 10
            break
    return topic_lists


# Build the Twitter queries

print 'Building and grouping Twitter queries...'

names = []
queries = []
for p1, p2 in pairwise(parties):
    party_list = [p1, p2]
    topic_lists = group_topics(party_list, topics)
    for topic_list in topic_lists:
        query_total = ''
        couples = cartesian([party_list, topic_list])
        for party, topic in couples:
            query = '((' + party + ' OR #' + party + ') (' + topic + ' OR #' + topic + '))'
            if not query_total:
                query_total = query
            else:
                query_total = query_total + ' OR ' + query
        names.append('#'.join(party_list + topic_list))
        queries.append(query_total)
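
# Illustration with hypothetical inputs: for party_list = ['labour', 'tory'] and
# topic_list = ['brexit'], couples holds the two (party, topic) pairs and query_total
# becomes:
#   '((labour OR #labour) (brexit OR #brexit)) OR ((tory OR #tory) (brexit OR #brexit))'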

# Launch the scraping processes

print 'Preparing to launch...'

commands = ''
for i, query in enumerate(queries):
Example 11
def task(tree_file):
    if os.path.exists("assets/processed/stops_aligned/{}".format(
            tree_file.split("/")[-1])):
        print("passed", tree_file)
        return

    try:
        tree = np.load(tree_file, allow_pickle=True)["arr_0"].item()
    except Exception:
        print(tree_file)
        return

    stop_tree = {}

    for route_id in tqdm(tree):
        stops = routes_data[route_id]
        stop_tree[route_id] = {}

        directions = []
        for e in range(1, len(stops)):
            directions.append(
                cartesian(*stops_data[stops[e]][:2]) -
                cartesian(*stops_data[stops[e - 1]][:2]))

        for each_trip in tree[route_id]:
            stop_tree[route_id][each_trip] = [None] * len(stops)

            for start_stop in range(0, len(stops)):

                if tree[route_id][each_trip][start_stop] is None:
                    continue

                trip_stop_data = np.array(
                    tree[route_id][each_trip][start_stop])

                if (len(trip_stop_data) > 1
                        and (np.diff([e[0]
                                      for e in trip_stop_data]) < 0).any()):
                    if (np.count_nonzero(
                            np.diff([e[0] for e in trip_stop_data]) < 0) > 1):
                        continue
                    else:
                        trip_stop_data = sorted(trip_stop_data,
                                                key=lambda e: e[0])

                _, un_repeat_stops = np.unique([e[0] for e in trip_stop_data],
                                               return_index=True)

                trip_stop_data = np.array(trip_stop_data)[un_repeat_stops]

                assert (np.diff([e[0] for e in trip_stop_data]) > 0).all()

                distances = np.array([
                    haversine_dist(*e[2:], *stops_data[stops[start_stop]][:2])
                    for e in trip_stop_data
                ])

                time = np.array([e[0] for e in trip_stop_data])
                close_time = np.argmin(distances)
                close_time_val = time[close_time]

                time -= time[close_time]

                max_range = np.zeros(len(time), dtype=bool)
                max_range[-15 + close_time:close_time] = True
                max_range[close_time:close_time + 15] = True
                useful_indices = np.logical_and(
                    np.logical_and(time < 5 * 60, time > -5 * 60), max_range)
                time = time[useful_indices]
                distances = distances[useful_indices]
                trip_stop_data = trip_stop_data[useful_indices]

                if start_stop == 0:
                    prev_dir = -1 * directions[0]
                    next_dir = directions[0]
                elif start_stop == len(stops) - 1:
                    prev_dir = -1 * directions[len(stops) - 2]
                    next_dir = directions[len(stops) - 2]
                else:
                    prev_dir = -1 * directions[start_stop - 1]
                    next_dir = directions[start_stop]

                prev_dir = np.array([
                    get_angle(
                        prev_dir,
                        cartesian(*e[2:]) -
                        cartesian(*stops_data[stops[start_stop]][:2]),
                    ) for e in trip_stop_data
                ])
                next_dir = np.array([
                    get_angle(
                        next_dir,
                        cartesian(*e[2:]) -
                        cartesian(*stops_data[stops[start_stop]][:2]),
                    ) for e in trip_stop_data
                ])

                backward = prev_dir > next_dir

                displacement = distances * (-1 * (backward - 0.5) * 2)

                useful_indices = longest_subsequence(displacement,
                                                     return_index=True)
                displacement = displacement[useful_indices]
                time = time[useful_indices]
                trip_stop_data = trip_stop_data[useful_indices]

                if (len(displacement) > 1 and -32 > displacement[0]
                        and -32 < displacement[-1]):
                    stop_tree[route_id][each_trip][start_stop] = int(
                        close_time_val + interpolate.interp1d(
                            displacement, time, fill_value="extrapolate")(-32))
                elif len(displacement) > 1:
                    dist_diff = np.diff(displacement)
                    drequired = (displacement[0] + 32 if displacement[0] > -32
                                 else displacement[-1] + 32)
                    velocity_ind = np.argmin(np.abs(dist_diff - drequired))
                    velocity = dist_diff[velocity_ind] / (
                        time[velocity_ind + 1] - time[velocity_ind])

                    trequired = (time[0] - drequired / velocity
                                 if displacement[0] > -32 else time[-1] +
                                 drequired / velocity)

                    stop_tree[route_id][each_trip][start_stop] = int(
                        close_time_val + trequired)
                else:
                    assert len(displacement) == 1
                    speed = trip_stop_data[0][1] * 3.6
                    drequired = displacement[0] + 32

                    if speed == 0:
                        speed = 2.7

                    trequired = (time[0] - drequired / speed
                                 if displacement[0] > -32 else time[-1] +
                                 drequired / speed)
                    stop_tree[route_id][each_trip][start_stop] = int(
                        close_time_val + trequired)

    np.savez_compressed(
        "assets/processed/stops_aligned/{}".format(tree_file.split("/")[-1]),
        stop_tree,
    )
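haversine_dist and get_angle are used above but not defined in the snippet. Minimal sketches of what they plausibly compute, with signatures inferred from the call sites (assumptions, not the project's actual helpers):

import numpy as np

def haversine_dist(lat1, lon1, lat2, lon2, radius=6371000.0):
    # great-circle distance in metres between two points given in degrees
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * radius * np.arcsin(np.sqrt(a))

def get_angle(v1, v2):
    # unsigned angle (radians) between two direction vectors
    v1, v2 = np.asarray(v1, dtype=float), np.asarray(v2, dtype=float)
    cos_a = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-12)
    return np.arccos(np.clip(cos_a, -1.0, 1.0))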
Example 12
    def fit(self, W, C, vocab, AD, author_list, timestamp_list, verbose=True):
        # {{{ run gibbs sampling
        import sys
        import numpy as np
        from utils import cartesian
        from scipy.stats import poisson
        '''
            W[:,0] -> word index
            W[:,1] -> document index
            W[:,2] -> timestamp index

            C: document-citation sparse matrix
            C[i, t] -> citation count for the ith document at timestamp t

            AD: author-document sparse matrix
        '''

        # number of authors
        A = author_list.size
        # number of unique tokens
        V = vocab.size
        # total number of tokens
        nnz = W.shape[0]
        # number of timestamps
        T = timestamp_list.size

        # save meta-info to this object
        self.vocabulary = vocab
        self.authors = author_list
        self.timestamps = timestamp_list

        # init to one above the max val
        z_states = np.zeros((self.n_iter + 1, nnz), dtype=np.uint32) + self.K
        a_states = np.zeros((self.n_iter + 1, nnz), dtype=np.uint32) + A
        t_states = np.zeros((self.n_iter + 1, nnz), dtype=np.uint32) + T

        # create all needed sequences
        k_range = np.arange(self.K)
        t_range = np.arange(T)
        a_range = np.arange(A)
        v_range = np.arange(V)

        # 1.1 initialize topic assignment randomly
        z_states[0, :] = np.random.choice(self.K, nnz, True)

        # 1.2 initialize author and timestamp assignment
        for i in np.arange(nnz):
            t = W[i, 2]
            di = W[i, 1]
            ad = AD[:, di].nonzero()[0]
            a_states[0, i] = np.random.choice(ad)
            t_states[0, i] = np.random.choice(np.arange(t, T))

        # 2. initialize lambda matrix
        lambda_ = np.zeros((self.K, T), dtype=np.uint32)
        for k in k_range:
            k_indices = np.where(z_states[0, :] == k)[0]
            for t in np.arange(T):
                t_indices = np.where(t_states[0, :] == t)[0]
                kt_indices = np.intersect1d(t_indices, k_indices)
                d_indices = W[kt_indices, 1]
                lambda_[k, t] = C[d_indices, t].mean()

        # zeros set to overall mean
        lam_x, lam_y = np.where(lambda_ == 0)
        if lam_x.size:
            lambda_[lam_x, lam_y] = C.mean() / float((lam_x.size))

        # 3. sample
        # {{{
        for iter_ in np.arange(1, self.n_iter + 1):

            if verbose:
                print 'Iter %i...... (Total %i)' % (iter_, self.n_iter)
                sys.stdout.flush()

            else:
                # print progress every 100 iterations
                if iter_ % 100 == 0:
                    print 'Iter %i...... (Total %i)' % (iter_, self.n_iter)
                    sys.stdout.flush()

            for i in np.arange(nnz):
                #{{{ sample each token sequentially

                # {{{ denominators
                den_author = np.zeros(A, dtype=np.float_)
                for a in a_range:
                    for k in k_range:
                        # words that are assigned to topic k, excluding the current one
                        k_indices = np.append(np.where(z_states[iter_-1, i+1:]==k)[0] + (i+1),\
                                              np.where(z_states[iter_, :i]==k)[0])
                        n_a_k_i = (a_states[iter_-1, k_indices[k_indices > i]] == a).sum() + \
                                  (a_states[iter_, k_indices[k_indices < i]] == a).sum()
                        den_author[a] += n_a_k_i
                    den_author[a] += self.K * self.alpha

                den_timestamp = np.zeros(self.K, dtype=np.float_)
                den_token = np.zeros(self.K, dtype=np.float_)
                for k in k_range:

                    for t in t_range:
                        # words that are assigned to timestamp t, excluding the current one
                        t_indices = np.append(np.where(t_states[iter_-1, i+1:]==t)[0] + (i+1),\
                                              np.where(t_states[iter_, :i]==t)[0])
                        n_k_t_i = (z_states[iter_-1, t_indices[t_indices > i]] == k).sum() + \
                                  (z_states[iter_, t_indices[t_indices < i]] == k).sum()
                        den_timestamp[k] += n_k_t_i
                    den_timestamp[k] += T * self.pi

                    for v in v_range:
                        # words that are tokens v, excluding the current one
                        v_indices = np.append(
                            np.where(W[i + 1:, 0] == v)[0] + (i + 1),
                            np.where(W[:i, 0] == v)[0])
                        n_k_v_i = (z_states[iter_-1, v_indices[v_indices > i]] == k).sum() + \
                                  (z_states[iter_, v_indices[v_indices < i]] == k).sum()
                        den_token[k] += n_k_v_i
                    den_token[k] += V * self.beta

                # }}}

                v = W[i, 0]
                t = W[i, 2]
                di = W[i, 1]
                ci = C[di, :]

                # find its authors
                ad = AD[:, di].nonzero()[0]

                comb_list = cartesian((np.arange(t, T), k_range, ad))
                comb_p_list = np.zeros(comb_list.shape[0], dtype=np.float_)

                # excluding the current one
                v_indices = np.append(
                    np.where(W[i + 1:, 0] == v)[0] + (i + 1),
                    np.where(W[:i, 0] == v)[0])

                # {{{ for each combination, obtain full conditional probability
                for comb_index in np.arange(comb_p_list.size):

                    comb = comb_list[comb_index]
                    t, k, a = comb

                    # 1
                    t_indices = np.append(np.where(t_states[iter_-1, i+1:]==t)[0] + (i+1),\
                                          np.where(t_states[iter_, :i]==t)[0])
                    n_k_t_i = (z_states[iter_-1, t_indices[t_indices > i]] == k).sum() + \
                              (z_states[iter_, t_indices[t_indices < i]] == k).sum()
                    p1 = (n_k_t_i + self.pi) / den_timestamp[k]

                    # 2
                    n_k_v_i = (z_states[iter_-1, v_indices[v_indices > i]] == k).sum() + \
                              (z_states[iter_, v_indices[v_indices < i]] == k).sum()
                    p2 = (n_k_v_i + self.beta) / den_token[k]

                    # 3
                    # excluding the current one
                    k_indices = np.append(np.where(z_states[iter_-1, i+1:] == k)[0] + (i+1),\
                                          np.where(z_states[iter_, :i] == k)[0])
                    n_a_k_i = (a_states[iter_-1, k_indices[k_indices > i]] == a).sum() + \
                              (a_states[iter_, k_indices[k_indices < i]] == a).sum()
                    p3 = (n_a_k_i + self.alpha) / den_author[a]

                    # poisson pmf
                    p4 = poisson.pmf(ci[t], mu=lambda_[k, t])

                    #print p1, p2, p3, p4, lambda_[k,t], ci[t]
                    comb_p_list[comb_index] = p1 * p2 * p3 * p4
                # }}}

                # rescale to [0,1]
                comb_p_list = comb_p_list / comb_p_list.sum()

                # sample for i-th word
                comb_index = np.random.choice(np.arange(comb_p_list.size),
                                              p=comb_p_list)
                t, k, a = comb_list[comb_index]
                t_states[iter_, i] = t
                z_states[iter_, i] = k
                a_states[iter_, i] = a
            #}}} END for i-th TOKEN

            # update lambda after each iteration
            for k in k_range:
                k_indices = np.where(z_states[iter_, :] == k)[0]
                for t in t_range:
                    t_indices = np.where(t_states[iter_, :] == t)[0]
                    kt_indices = np.intersect1d(k_indices, t_indices)
                    d_indices = W[kt_indices, 1]
                    # if no word is assigned to topic k and timestamp t, keep it as before
                    if d_indices.size > 0:
                        lambda_[k, t] = C[d_indices, t].mean()
        # }}}

        # 4. obtain \theta, \phi, and \psi

        # burn-in: first half
        z_samples = z_states[1:, :][self.n_iter / 2:, :]
        a_samples = a_states[1:, :][self.n_iter / 2:, :]
        t_samples = t_states[1:, :][self.n_iter / 2:, :]

        # author-topic
        theta = np.zeros((A, self.K), dtype=np.float_)
        # topic-word
        phi = np.zeros((self.K, V), dtype=np.float_)
        # topic-timestamp
        psi = np.zeros((self.K, T), dtype=np.float_)

        for a in a_range:
            den = self.K * self.alpha + (a_samples == a).sum()
            a_x, a_y = np.where(a_samples == a)
            for k in k_range:
                n_a_k = (z_samples[a_x, a_y] == k).sum()
                theta[a, k] = float(n_a_k + self.alpha) / (den)

        for k in k_range:
            k_count = (z_samples == k).sum()
            den_v = V * self.beta + k_count
            den_t = T * self.pi + k_count
            # x is iteration number, y is word index
            k_x, k_y = np.where(z_samples == k)
            for v in v_range:
                n_k_v = (W[k_y, 0] == v).sum()
                phi[k, v] = float(n_k_v + self.beta) / den_v

            for t in t_range:
                n_k_t = (t_samples[k_x, k_y] == t).sum()
                psi[k, t] = float(n_k_t + self.pi) / den_t

                # update lambda
                t_x, t_y = np.where(t_samples == t)
                kt_indices = np.intersect1d(k_y, t_y)
                d_indices = W[kt_indices, 1]
                # if no word is assigned to topic k and timestamp t, keep it as before
                if d_indices.size > 0:
                    lambda_[k, t] = C[d_indices, t].mean()

        self.theta = theta
        self.phi = phi
        self.psi = psi
        self.lambda_ = lambda_
        self.z_samples = z_samples
        self.a_samples = a_samples
        self.t_samples = t_samples
        # }}}
        return theta, phi, psi, lambda_
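For intuition about comb_list: each inner iteration enumerates every admissible (timestamp, topic, author) triple for the current token, i.e. timestamps from the document's own timestamp onwards, all K topics, and the document's authors. Assuming cartesian is the usual row-wise Cartesian product, the same enumeration with the standard library would be:

import numpy as np
from itertools import product

# toy values: timestamps {2, 3}, K = 2 topics, authors {0, 4} of the current document
t, T = 2, 4
k_range = np.arange(2)
ad = np.array([0, 4])

comb_list = np.array(list(product(np.arange(t, T), k_range, ad)))
print(comb_list.shape)  # (8, 3): one row per (timestamp, topic, author) triple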