Ejemplo n.º 1
0
def bm25(corpus, b, k1, stopword):
    """Return an IDF-weighted, L2-normalised BM25 document-term matrix.

    Unigram counts are taken with ``CountVectorizer`` (``min_df=5``,
    ``max_df=0.3``).  Every stored count ``tf`` is rescaled to::

        tf * (k1 + 1) / (tf + k1 * ((1 - b) + b * docL / aveL))

    where ``docL`` is the row sum of the count matrix and ``aveL`` is the
    mean row sum.  The rescaled matrix is then passed through a
    ``TfidfTransformer(norm='l2')`` that was fitted on the raw counts.

    Args:
        corpus: Documents, passed to ``CountVectorizer.fit_transform``.
        b: Weight of the ``docL / aveL`` term in the denominator.
        k1: Saturation constant of the formula above.
        stopword: Forwarded as ``stop_words`` to ``CountVectorizer``.

    Returns:
        The sparse matrix produced by ``TfidfTransformer.transform``.
    """
    CV = CountVectorizer(ngram_range=(1, 1),
                         stop_words=stopword,
                         min_df=5,
                         max_df=0.3)
    IDFTrans = TfidfTransformer(norm='l2')

    output = CV.fit_transform(corpus)
    IDFTrans.fit(output)
    temp = output.copy()

    aveL = output.sum() / output.shape[0]
    denominator = k1 * ((1 - b) + b * (output.sum(1) / aveL))

    # Turn every stored entry into 1, then scale each row by its
    # k1*((1-b)+b*(docL/aveL)) factor.
    temp.data = temp.data / temp.data
    temp = csr_matrix.multiply(temp, denominator)

    # temp becomes tf + k1*((1-b)+b*(docL/aveL)).
    temp = temp + output
    # Deliberately not ``output *= (k1 + 1)``: the count matrix has an
    # integer dtype and an in-place product with a float k1 cannot be cast
    # back into it.  A regular product upcasts to float instead.
    output = output * (k1 + 1)

    # Reciprocal of the denominator, then element-wise product.
    temp.data = 1 / temp.data
    output = csr_matrix.multiply(output, temp)

    return IDFTrans.transform(output)
def bm25(corpus, b, k1, stopword):
    """Return an IDF-weighted, L2-normalised BM25 document-term matrix.

    Unigram counts are taken with ``CountVectorizer`` (``min_df=5``,
    ``max_df=0.3``, ``max_features=5000``).  Every stored count ``tf`` is
    rescaled to::

        tf * (k1 + 1) / (tf + k1 * ((1 - b) + b * docL / aveL))

    where ``docL`` is the row sum of the count matrix and ``aveL`` is the
    mean row sum.  The rescaled matrix is then passed through a
    ``TfidfTransformer(norm='l2')`` that was fitted on the raw counts.

    Args:
        corpus: Documents, passed to ``CountVectorizer.fit_transform``.
        b: Weight of the ``docL / aveL`` term in the denominator.
        k1: Saturation constant of the formula above.
        stopword: Forwarded as ``stop_words`` to ``CountVectorizer``.

    Returns:
        The sparse matrix produced by ``TfidfTransformer.transform``.
    """
    CV = CountVectorizer(ngram_range=(1, 1),
                         stop_words=stopword,
                         min_df=5,
                         max_df=0.3,
                         max_features=5000)
    IDFTrans = TfidfTransformer(norm='l2')

    output = CV.fit_transform(corpus)
    IDFTrans.fit(output)
    temp = output.copy()

    aveL = output.sum() / output.shape[0]
    denominator = k1 * ((1 - b) + b * (output.sum(1) / aveL))

    # set elements of every row to k1*((1-b)+b*(docL/aveL))
    temp.data = temp.data / temp.data
    temp = csr_matrix.multiply(temp, denominator)

    # to tf + k1*((1-b)+b*(docL/aveL))
    temp = temp + output
    # Deliberately not ``output *= (k1 + 1)``: the count matrix has an
    # integer dtype and an in-place product with a float k1 cannot be cast
    # back into it.  A regular product upcasts to float instead.
    output = output * (k1 + 1)

    # reciprocal and then multiply
    temp.data = 1 / temp.data
    output = csr_matrix.multiply(output, temp)

    return IDFTrans.transform(output)
Ejemplo n.º 3
0
def buildAttributeGraph(X_select, PG):
    """Build one dense n-by-n matrix per attribute column of ``X_select``.

    Slice ``[:, :, i]`` of the result starts as the dense form of ``PG``.
    Every row whose index has a non-zero entry in attribute column ``i`` is
    then replaced by the same row of ``x * x^T / (1e-64 + sum(x))``, where
    ``x`` is that attribute column.

    Args:
        X_select: Sparse matrix of shape ``(n, T)``; any scipy.sparse
            format that supports column slicing (CSR or CSC).
        PG: Sparse ``(n, n)`` matrix used as the default content of every
            slice.

    Returns:
        ``numpy.ndarray`` of shape ``(n, n, T)``.
    """
    n, T = X_select.shape
    # PG does not change inside the loop, so densify it only once.
    base = PG.toarray()
    P_Ai = np.zeros([n, n, T])
    for i_attribute in range(T):
        P_Ai[:, :, i_attribute] = base
        column = X_select[:, i_attribute]
        flyout_ind, _ = column.nonzero()
        if flyout_ind.size == 0:
            # No row is replaced for an all-zero attribute column.
            continue
        # ``.transpose()`` works for every sparse format, unlike the unbound
        # ``csc_matrix.transpose`` call which assumes CSC storage.
        scaled_row = column.transpose().multiply(1 / (1e-64 + column.sum()))
        tmp = column.dot(scaled_row)
        P_Ai[flyout_ind, :, i_attribute] = tmp[flyout_ind, :].toarray()
    return P_Ai
Ejemplo n.º 4
0
def deepwalk_infty_Embedding(PG, option):
    """Embed nodes from the log of ``((I - alpha*PG)^-1 - I) / alpha``.

    The matrix ``Pi = ((I - alpha*PG)^-1 - I) / alpha`` is formed densely,
    entries below ``1e-16`` are raised to ``1e-16`` so the logarithm is
    finite, and ``log(Pi)`` is factorised with ``randomized_svd``.

    Args:
        PG: Square sparse matrix; its first dimension gives the size of the
            identity matrix used below.
        option: Mapping with keys ``'flyout'`` (the factor ``alpha``) and
            ``'dimension'`` (number of SVD components).

    Returns:
        ``U * sqrt(Sigma)``, one row per row of ``PG`` and
        ``option['dimension']`` columns.
    """
    alpha = option['flyout']
    dimension = option['dimension']
    # Take the size from PG itself instead of relying on a global ``n``.
    n = PG.shape[0]
    Pinv = inv( (csr_matrix(eye(n)) - csr_matrix.multiply(PG, alpha)).toarray() )  # Pinv = (I-\alpha*P)^{-1}
    Pi = np.divide(  Pinv - csr_matrix(eye(n)), alpha )  # (Pinv -I)/alpha
    # Clamp tiny or non-positive entries so np.log stays finite.
    ind = find( Pi < 1e-16 )
    Pi[ind[0],ind[1]] = 1e-16
    Y = np.log(Pi)
    U, Sigma, VT = randomized_svd(Y, n_components=dimension, n_iter= 30, random_state=None)
    Uw = U.dot( np.diag( np.sqrt(Sigma)) )
    return Uw
Ejemplo n.º 5
0
def adagrad(X_train, y_train, X_test, y_test, lambda_, eta, B, iterations):
    """Train a linear model with mini-batch AdaGrad and a projection step.

    Each iteration draws ``B`` distinct training rows, sums ``x * y`` over
    the rows whose margin ``w . x * y`` is below 1, and forms the gradient
    ``lambda_ * w - sum / B``.  The per-coordinate accumulator ``G`` (started
    at ones) is increased by the squared gradient, the step is scaled by
    ``eta / sqrt(G)``, and the result is rescaled so that
    ``norm(sqrt(G) * w)`` does not exceed ``1 / sqrt(lambda_)``.

    Args:
        X_train: Sparse training matrix, one sample per row.
        y_train: Training labels, indexable by row number.
        X_test: Test matrix, passed to ``compute_error``.
        y_test: Test labels, passed to ``compute_error``.
        lambda_: Regularisation weight.
        eta: Base step size.
        B: Mini-batch size (rows sampled without replacement).
        iterations: Number of update steps.

    Returns:
        Tuple ``(w, train_error, test_error)``: the final 1-by-d sparse
        weight row and the errors recorded on every 10th iteration.
    """
    n_features = X_train.shape[1]
    train_error = []
    test_error = []
    w = csr_matrix([0] * n_features)
    G = csr_matrix([1] * n_features)
    # Constant row of step sizes; it does not change between iterations.
    eta_row = csr_matrix([eta] * n_features)
    for iteration in range(iterations):
        # Named ``violation_sum`` to avoid shadowing the builtin ``sum``.
        violation_sum = csr_matrix([0] * n_features)
        indexes = random.sample(range(X_train.shape[0]), B)
        for index in indexes:
            margin = w.dot(X_train[index, :].T) * y_train[index]
            if margin < 1:
                violation_sum = violation_sum + X_train[index, :] * y_train[index]
        gradient = csr_matrix(lambda_ * w - violation_sum / B)
        # AdaGrad accumulates squared gradients across iterations.  The
        # accumulator was previously never written back, so every step used
        # only the current gradient.
        G = G + csr_matrix.multiply(gradient, gradient)
        g_sqrt = np.sqrt(G)
        w_ = w - csr_matrix.multiply(csr_matrix(eta_row / g_sqrt), gradient)
        res = csr_matrix.multiply(g_sqrt, w_)
        w = min(1.0, 1.0 / math.sqrt(lambda_) / norm(res)) * w_
        if iteration % 10 == 0:
            train_error.append(compute_error(X_train, y_train, w))
            test_error.append(compute_error(X_test, y_test, w))

    return w, train_error, test_error
Ejemplo n.º 6
0
def f_Achap(M, Zchap, ZZchap, dchap, N, K):
    """Assemble the blocks of the matrix (A 'kron' I).

    For every ``k`` in ``range(K)`` a block is built as the Kronecker
    product of ``khatri_rao((Zchap[k], ZZchap[k])).dot(M[k])`` with the
    ``N``-by-``N`` identity, and its rows are scaled by ``dchap[k]``.

    Returns a pair: the blocks stacked horizontally, and the sum of
    ``square_mat_csr(block)`` over all blocks (the norm term).
    """
    blocks = []
    norm_total = 0.  # running sum of the per-block norm terms
    for k in range(K):
        kr_factor = csr_matrix(khatri_rao((Zchap[k], ZZchap[k])))
        lifted = kron(kr_factor.dot(M[k]), identity(N))
        # Multiplying by a sparse column vector scales each row, which is a
        # cheap substitute for a product with a diagonal matrix.
        row_weights = csr_matrix(dchap[k][:, None])
        block = csr_matrix.multiply(row_weights, lifted)
        blocks.append(block)
        norm_total += square_mat_csr(block)
    return hstack(blocks), norm_total
Ejemplo n.º 7
0
def Muititau_Corr(img, dpl):
    """Compute multi-tau correlation statistics of a sparse frame stack.

    The first ``dpl`` delays (1 .. dpl frames) are evaluated on ``img``
    itself.  After that, each level evaluates delays ``dpl+1 .. 2*dpl`` on
    the current binned stack and then averages consecutive frame pairs,
    halving the number of frames, until a delay no longer fits.

    Args:
        img: Sparse matrix of shape ``(num_frame, num_pixel)``.
        dpl: Number of delays evaluated per level; must be at least 1.

    Returns:
        Tuple ``(G2, IP, IF, t_el)``.  For each evaluated delay, ``G2``
        holds the per-pixel mean of ``frame[t + delay] * frame[t]``, ``IP``
        the mean of the later frames, ``IF`` the mean of the earlier frames
        and ``t_el`` the delay expressed in original frames.

    Raises:
        ValueError: If ``dpl`` is smaller than 1.
    """
    import math

    import numpy as np
    from scipy.sparse import csr_matrix

    if dpl < 1:
        # Without any delay per level the binning loop below never reaches
        # its exit test and would run forever.
        raise ValueError("dpl must be a positive integer, got %r" % (dpl,))

    def lagged_stats(frames, lag):
        """Return (mean product, mean of later, mean of earlier) at lag."""
        later = frames[lag:, :]
        earlier = frames[:-lag, :]
        return (csr_matrix.multiply(later, earlier).mean(axis=0),
                later.mean(axis=0),
                earlier.mean(axis=0))

    num_frame, num_pixel = img.shape

    multitau_length_lim = (int(math.log(num_frame, 2)) + 1) * dpl

    G2 = np.zeros([multitau_length_lim, num_pixel])
    IP = np.zeros([multitau_length_lim, num_pixel])
    IF = np.zeros([multitau_length_lim, num_pixel])
    t_el = np.zeros([multitau_length_lim])

    # Level-free part: delays 1 .. dpl on the unbinned frames.
    for ii in range(dpl):
        delay_orig = ii + 1
        print("MultiTau Spacing %2i with %7i Total Number of Frames" %
              (delay_orig, num_frame))

        if delay_orig >= num_frame:
            # No frame pair is left at this delay; stop here, like the
            # binned loop below does, instead of averaging zero rows.
            return G2[:ii, :], IP[:ii, :], IF[:ii, :], t_el[:ii]

        G2[ii, :], IP[ii, :], IF[ii, :] = lagged_stats(img, delay_orig)
        t_el[ii] = delay_orig

    level = 0
    img_bin = img
    count = dpl

    while True:
        for kk in range(dpl):
            delay_bin = kk + dpl + 1
            delay_orig = 2**level * delay_bin
            print("MultiTau Spacing %2i with %7i Total Number of Frames" %
                  (delay_orig, img.shape[0]))

            if delay_bin >= img_bin.shape[0]:
                return (G2[:count, :], IP[:count, :], IF[:count, :],
                        t_el[:count])

            G2[count, :], IP[count, :], IF[count, :] = lagged_stats(
                img_bin, delay_bin)
            t_el[count] = delay_orig
            count += 1

        # Average consecutive frame pairs (a trailing unpaired frame is
        # dropped) to build the stack for the next level.
        even_end = 2 * (img_bin.shape[0] // 2)
        second_of_pair = np.arange(1, even_end, 2)
        first_of_pair = np.arange(0, even_end, 2)
        img_bin = (img_bin[second_of_pair, :] + img_bin[first_of_pair, :]) / 2

        level += 1
Ejemplo n.º 8
0
 def calculate_expected_counts(self, exps, sum_of_exps):
     """Return the weighted column sums of ``self.list_of_mats``.

     Each matrix in ``self.list_of_mats`` is transposed and scaled by the
     matching entry of ``exps``; the scaled matrices are summed, divided by
     ``sum_of_exps`` and reduced along axis 1.

     Args:
         exps: One scalar weight per matrix in ``self.list_of_mats``.
         sum_of_exps: Scalar normaliser applied to the weighted sum.

     Returns:
         A 1-by-m matrix, where m is the second dimension of
         ``self.mat.shape``.
     """
     # Build the empty accumulator straight from the shape; going through
     # a dense np.zeros array would allocate rows*cols values for nothing.
     sum_of_mats = csr_matrix(self.mat.shape).T
     for mat, weight in zip(self.list_of_mats, exps):
         sum_of_mats += csr_matrix.multiply(mat.T, weight)
     return (csr_matrix.multiply(sum_of_mats,
                                 (1 / sum_of_exps))).sum(axis=1).T
Ejemplo n.º 9
0
if not method_inv:
    # -- method 1
    H = []

    # finite-step random walk without memory:
    # after the loop Pi = P + P^2 + ... + P^(L+1)
    Pi = P
    tmp = P
    for step in range(0, L):
        tmp = P.dot(tmp)
        Pi = Pi + tmp
    # The small offset keeps the logarithm finite for zero entries.
    Y = np.log(Pi.toarray()+10**(-30))

else:
    # -- method 2: closed form ((I - alpha*P)^-1 - I) / alpha
    Pinv = inv( (csr_matrix(eye(n))-csr_matrix.multiply(P, alpha)).toarray() )
    Pi = np.divide( Pinv-csr_matrix(eye(n)), alpha )
    # NOTE(review): no offset or clamp is applied here, so zero or negative
    # entries of Pi yield -inf/nan in Y -- confirm this is intended.
    Y = np.log(Pi)

# Parenthesised form prints the same text on Python 2 and Python 3.
print("calculating SVD")

U, Sigma, VT = randomized_svd(Y, n_components=size, n_iter= iters, random_state=None)
# Scale the left singular vectors by the square roots of the singular values.
Uw = U.dot( np.diag( np.sqrt(Sigma)) )

save_word2vec_format(output_file, Uw)

'''
print "===================================================="
print "result without top-k"
score(Uw, startfrom0=True, topk=True)
print "result with top-k"