def build_rank(self, tweets):
    """Build tf-idf ranking score for terms in the corpus.

    Note:
        The code in this method could have been extracted to other smaller
        methods, improving legibility. This extraction has not been done so
        that its runtime complexity can be computed easily (the runtime
        complexity can be improved).

    Args:
        tweets (list of Indexable): List of indexed tweets that will be
            considered during tf-idf score computation.
    """
    self.__build_vocabulary(tweets)

    n_terms = len(self.vocabulary)
    n_docs = len(tweets)
    ft_matrix = sp.lil_matrix((n_docs, n_terms), dtype=np.dtype(float))

    logging.info("[Ranker] Vocabulary assembled with terms count %s, docs count %s"
                 % ("{:,}".format(n_terms), "{:,}".format(n_docs)))

    # compute tf
    logging.info("[Ranker] Starting tf computation ...")
    for index, indexable in enumerate(tweets):
        for word in indexable.words_generator(self.stop_words):
            word_index_in_vocabulary = self.vocabulary[word]
            doc_word_count = indexable.count_for_word(word)
            ft_matrix[index, word_index_in_vocabulary] = doc_word_count

    # keep a copy of this matrix in compressed sparse column format
    self.ft_matrix = ft_matrix.tocsc()

    # compute idf with smoothing
    logging.info("[Ranker] Starting tf-idf computation ...")
    df = np.diff(self.ft_matrix.indptr) + self.smoothing
    n_docs_smooth = n_docs + self.smoothing

    # create diagonal matrix to be multiplied with ft
    idf = np.log(float(n_docs_smooth) / df) + 1.0
    self.ifd_diag_matrix = sp.spdiags(idf, diags=0, m=n_terms, n=n_terms)

    # compute tf-idf
    self.tf_idf_matrix = self.ft_matrix * self.ifd_diag_matrix
    self.tf_idf_matrix = self.tf_idf_matrix.tocsr()

    # compute tf-idf normalization (l2, row-wise, in place)
    norm = self.tf_idf_matrix.tocsr(copy=True)
    norm.data **= 2
    norm = norm.sum(axis=1)
    n_nzeros = np.where(norm > 0)
    norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
    norm = np.array(norm).T[0]

    sptools.csr_scale_rows(self.tf_idf_matrix.shape[0],
                           self.tf_idf_matrix.shape[1],
                           self.tf_idf_matrix.indptr,
                           self.tf_idf_matrix.indices,
                           self.tf_idf_matrix.data,
                           norm)
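The idf step above leans on the CSC layout: np.diff(self.ft_matrix.indptr) is the per-term document frequency, because indptr stores the cumulative number of stored entries per column. A minimal standalone sketch (toy matrix and names chosen here, not part of the original class) of that step and the smoothed idf:

import numpy as np
import scipy.sparse as sp

# 3 documents x 3 terms; zeros are not stored in the sparse matrix
ft = sp.csc_matrix(np.array([[1., 0., 2.],
                             [0., 0., 1.],
                             [3., 0., 0.]]))

df = np.diff(ft.indptr)      # -> array([2, 0, 2]): docs containing each term
smoothing = 1
idf = np.log(float(ft.shape[0] + smoothing) / (df + smoothing)) + 1.0
print(idf)                   # smoothed idf per term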
def distr_chips_row(matrix, chips, n_jobs=-1, norm=True, dist_zero_rows=True, mode="integers"):
    '''
    Trial roulette method for eliciting Dirichlet priors from an expressed hypothesis matrix.
    This function works row-based; thus, each row will receive the given number of chips!

    :param matrix: csr_matrix A_k expressing theory H_k
    :param chips: number of (single row) chips C to distribute
    :param n_jobs: number of jobs, default -1
    :param norm: set False if matrix does not need to be normalized (row-based)
    :param dist_zero_rows: if set to False, the method does not distribute chips to rows
                           with only zeros (use with caution)
    :param mode: sets the mode of the distribution; "integers" means that the distributed
                 pseudo clicks are integers; "reals" means that the pseudo clicks
                 (hyperparameters) can also be positive reals
    :return: Dirichlet hyperparameters in the shape of a matrix
    '''
    if mode not in ['integers', 'reals']:
        raise Exception("Mode needs to be 'integers' or 'reals'!")

    chips = float(chips)
    if chips.is_integer() == False and mode == "integers":
        raise Exception("If mode is 'integers' then only use integer chip counts!")

    if norm == True:
        # row-normalize the hypothesis matrix in place
        norma = matrix.sum(axis=1)
        n_nzeros = np.where(norma > 0)
        n_zeros, _ = np.where(norma == 0)
        norma[n_nzeros] = 1.0 / norma[n_nzeros]
        norma = norma.T[0]
        csr_scale_rows(matrix.shape[0], matrix.shape[1],
                       matrix.indptr, matrix.indices, matrix.data, norma)

    if mode == "integers":
        r = Parallel(n_jobs=n_jobs)(
            delayed(distr_chips)(matrix[i, :], chips,
                                 dist_zero_matrix=dist_zero_rows, norm=False)
            for i in xrange(matrix.shape[0]))
        return scipy.sparse.vstack(r)

    if mode == "reals":
        matrix = matrix * chips
        if dist_zero_rows == True:
            # if some rows have 100% sparsity, we equally distribute the chips
            n, m = matrix.shape
            if norm == False:
                norma = matrix.sum(axis=1)
                n_zeros, _ = np.where(norma == 0)
            if len(n_zeros) > 0:
                # with numpy 1.10 dev, the next line is not needed
                if int(np.version.short_version.split(".")[1]) < 10:
                    n_zeros = np.array(n_zeros)[0]
                matrix[n_zeros, :] = chips / m
        return matrix
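In "reals" mode the result is simply each row normalized to sum to 1 and scaled by C, with rows that are entirely zero receiving C/m in every cell. A standalone toy sketch of that outcome (dense arithmetic and invented numbers, only to illustrate the distribution):

import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[1., 1., 0.],
                         [0., 0., 0.],
                         [0., 4., 4.]]))
chips = 10.0

D = A.toarray()
row_sums = D.sum(axis=1)
prior = np.zeros_like(D)
nz = row_sums > 0
prior[nz] = D[nz] / row_sums[nz, None] * chips   # normalized rows scaled by C
prior[~nz] = chips / D.shape[1]                  # empty rows: spread chips evenly
print(prior)
# [[ 5.    5.    0.  ]
#  [ 3.33  3.33  3.33]
#  [ 0.    5.    5.  ]]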
def norm(hypothesis):
    # work on a copy so the caller's matrix is left untouched
    hypothesis = hypothesis.copy()

    # inverse of each non-empty row sum
    norma = hypothesis.sum(axis=1)
    n_nzeros = np.where(norma > 0)
    n_zeros, _ = np.where(norma == 0)
    norma[n_nzeros] = 1.0 / norma[n_nzeros]
    norma = norma.T[0]

    # scale each row in place so that non-empty rows sum to 1
    csr_scale_rows(hypothesis.shape[0], hypothesis.shape[1],
                   hypothesis.indptr, hypothesis.indices,
                   hypothesis.data, norma)
    return hypothesis
def normalize(pairs):
    """ Normalize rows of a csr matrix (sum to 1) """
    if not pairs.format == "csr":
        raise ValueError("csr only")

    # inverse of each non-zero row sum
    factor = pairs.sum(axis=1)
    nnzeros = np.where(factor > 0)
    factor[nnzeros] = 1.0 / factor[nnzeros]
    factor = np.array(factor).T[0]

    # scale the rows of the matrix in place
    sparsetools.csr_scale_rows(pairs.shape[0], pairs.shape[1],
                               pairs.indptr, pairs.indices,
                               pairs.data, factor)
    return pairs
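The same row scaling can be done without the private sparsetools routine by expanding the per-row factors over the stored entries: np.diff(indptr) gives the number of stored entries per row, so repeating each factor that many times lines it up with the data array. A minimal sketch with toy data, not taken from any of the snippets above:

import numpy as np
from scipy.sparse import csr_matrix

pairs = csr_matrix(np.array([[2., 2., 0.],
                             [0., 0., 0.],
                             [1., 0., 3.]]))

row_sums = np.asarray(pairs.sum(axis=1)).ravel()
factor = np.where(row_sums > 0, 1.0 / np.where(row_sums == 0, 1.0, row_sums), 0.0)

# np.diff(indptr) is the number of stored entries per row
pairs.data *= np.repeat(factor, np.diff(pairs.indptr))
print(pairs.toarray())   # every non-empty row now sums to 1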
def l2_norm(sparse_csc_matrix):
    # convert csc_matrix to csr_matrix, which is done in linear time
    norm = sparse_csc_matrix.tocsr(copy=True)

    # compute the inverse of the l2 norm of non-zero elements
    norm.data **= 2
    norm = norm.sum(axis=1)
    n_nzeros = np.where(norm > 0)
    norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
    norm = np.array(norm).T[0]

    # modify sparse_csc_matrix in place
    csr_scale_rows(sparse_csc_matrix.shape[0],
                   sparse_csc_matrix.shape[1],
                   sparse_csc_matrix.indptr,
                   sparse_csc_matrix.indices,
                   sparse_csc_matrix.data,
                   norm)
def test_scale_rows_and_cols(self):
    D = matrix([[1, 0, 0, 2, 3],
                [0, 4, 0, 5, 0],
                [0, 0, 6, 7, 0]])

    # TODO expose through function
    S = csr_matrix(D)
    v = array([1, 2, 3])
    csr_scale_rows(3, 5, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), diag(v) * D)

    S = csr_matrix(D)
    v = array([1, 2, 3, 4, 5])
    csr_scale_columns(3, 5, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), D * diag(v))

    # blocks
    E = kron(D, [[1, 2], [3, 4]])
    S = bsr_matrix(E, blocksize=(2, 2))
    v = array([1, 2, 3, 4, 5, 6])
    bsr_scale_rows(3, 5, 2, 2, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), diag(v) * E)

    S = bsr_matrix(E, blocksize=(2, 2))
    v = array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    bsr_scale_columns(3, 5, 2, 2, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), E * diag(v))

    E = kron(D, [[1, 2, 3], [4, 5, 6]])
    S = bsr_matrix(E, blocksize=(2, 3))
    v = array([1, 2, 3, 4, 5, 6])
    bsr_scale_rows(3, 5, 2, 3, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), diag(v) * E)

    S = bsr_matrix(E, blocksize=(2, 3))
    v = array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
    bsr_scale_columns(3, 5, 2, 3, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), E * diag(v))
def build_rank(self, objects):
    self.__build_vocabulary(objects)

    n_terms = len(self.vocabulary)
    n_docs = len(objects)
    ft_matrix = sp.lil_matrix((n_docs, n_terms), dtype=np.dtype(float))

    # compute tf
    for index, indexable in enumerate(objects):
        for word in indexable.words_generator(self.stop_words):
            word_index_in_vocabulary = self.vocabulary[word]
            doc_word_count = indexable.count_for_word(word)
            ft_matrix[index, word_index_in_vocabulary] = doc_word_count

    self.ft_matrix = ft_matrix.tocsc()
    logger.info('Results will be displayed from higher to lower ranking...')

    # compute idf with smoothing
    df = np.diff(self.ft_matrix.indptr) + self.smoothing
    n_docs_smooth = n_docs + self.smoothing
    idf = np.log(float(n_docs_smooth) / df) + 1.0
    self.ifd_diag_matrix = sp.spdiags(idf, diags=0, m=n_terms, n=n_terms)

    # compute tf-idf
    self.tf_idf_matrix = self.ft_matrix * self.ifd_diag_matrix
    self.tf_idf_matrix = self.tf_idf_matrix.tocsr()

    # l2-normalize the tf-idf rows in place
    norm = self.tf_idf_matrix.tocsr(copy=True)
    norm.data **= 2
    norm = norm.sum(axis=1)
    n_nzeros = np.where(norm > 0)
    norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
    norm = np.array(norm).T[0]

    sptools.csr_scale_rows(self.tf_idf_matrix.shape[0],
                           self.tf_idf_matrix.shape[1],
                           self.tf_idf_matrix.indptr,
                           self.tf_idf_matrix.indices,
                           self.tf_idf_matrix.data,
                           norm)
def csr_scale_rows(A, x):
    # scale row i of the csr matrix A by x[i], in place
    sparsetools.csr_scale_rows(A.shape[0], A.shape[1],
                               A.indptr, A.indices, A.data, x)
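A hypothetical usage of this wrapper on a toy matrix, assuming the sparsetools import from the wrapper's module is in scope (the compiled routine lives in scipy.sparse.sparsetools in older releases and scipy.sparse._sparsetools in newer ones, with the same signature):

import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[1., 0., 2.],
                         [0., 3., 0.]]))
x = np.array([10.0, 0.5])

csr_scale_rows(A, x)   # wrapper defined above: row i is multiplied by x[i], in place
print(A.toarray())     # [[10.   0.  20. ], [ 0.   1.5  0. ]]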
def normalized_entropy():
    transition_matrix = pickle.load(open(SSD_HOME + "pickle/transition_matrix", "rb"))
    print "loaded transitions"

    graph = pickle.load(open(SSD_HOME + "pickle/graph", "rb"))
    print "loaded graph"

    values = pickle.load(open(SSD_HOME + "pickle/values", "rb"))
    vocab = pickle.load(open(SSD_HOME + "pickle/vocab", "rb"))
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)

    # structural hypothesis
    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape, dtype=np.float)
    transitions = csr_matrix((transition_matrix[2],
                              (transition_matrix[0], transition_matrix[1])),
                             shape=shape)
    del transition_matrix

    # delete all zero rows, see
    # http://stackoverflow.com/questions/31188141/scipy-sparse-matrix-remove-the-rows-whose-all-elements-are-zero
    print transitions.shape
    nonzero_row_indice, _ = transitions.nonzero()
    unique_nonzero_indice = np.unique(nonzero_row_indice)
    transitions = transitions[unique_nonzero_indice]
    print transitions.shape

    hyp_data = csr_matrix(transitions, copy=True)
    print hyp_data.shape
    hyp_structural = hyp_structural[unique_nonzero_indice]

    # norm the data (rows sum to 1)
    norm_h = hyp_data.sum(axis=1)
    n_nzeros = np.where(norm_h > 0)
    norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
    norm_h = np.array(norm_h).T[0]
    csr_scale_rows(hyp_data.shape[0], hyp_data.shape[1],
                   hyp_data.indptr, hyp_data.indices, hyp_data.data, norm_h)

    # calculate the entropy for each row
    # entropy = np.apply_along_axis(entropy_step, axis=1, arr=hyp_data)
    entropy = []
    c = 0
    for i in range(0, hyp_data.shape[0]):
        c += 1
        if c % 100000 == 0:
            print c
        x = hyp_data.getrow(i)
        entropy.append(entropy_step(x))
    print "entropy"

    # number of links per row, needed for normalization of the entropy
    norm_h = hyp_structural.sum(axis=1)
    # print norm_h

    normalized_entropy = []
    for i, x in enumerate(entropy):
        if i % 100000 == 0:
            print i
        if math.log(norm_h[i][0]) == 0:
            normalized_entropy.append(0)
        else:
            e = x / math.log(norm_h[i][0])
            normalized_entropy.append(e)
    print "normed entropy"

    write_pickle('output/normalized_entropy.obj', normalized_entropy)
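A minimal sketch of the per-row quantity computed above. entropy_step is not shown in the snippet, so this assumes it returns the Shannon entropy of the normalized row; dividing by the log of the out-degree (number of links in the structural row) puts the value in [0, 1]:

import numpy as np

row = np.array([8., 1., 1.])                 # transition counts of one row (toy data)
p = row / row.sum()                          # the row normalization done via csr_scale_rows
entropy = -np.sum(p[p > 0] * np.log(p[p > 0]))

out_degree = np.count_nonzero(row)           # number of links for this row
normalized = entropy / np.log(out_degree) if out_degree > 1 else 0.0
print(normalized)                            # 1.0 = uniform clicks, 0.0 = fully concentrated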
def gini_random_rows(row_normed):
    print "gini"
    print row_normed

    transition_matrix = pickle.load(open(SSD_HOME + "pickle/transition_matrix", "rb"))
    print "loaded transitions"

    graph = pickle.load(open(SSD_HOME + "pickle/graph", "rb"))
    print "loaded graph"

    values = pickle.load(open(SSD_HOME + "pickle/values", "rb"))
    vocab = pickle.load(open(SSD_HOME + "pickle/vocab", "rb"))
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)

    # structural hypothesis
    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape, dtype=np.float)
    transitions = csr_matrix((transition_matrix[2],
                              (transition_matrix[0], transition_matrix[1])),
                             shape=shape)
    del transition_matrix

    # delete all zero rows, see
    # http://stackoverflow.com/questions/31188141/scipy-sparse-matrix-remove-the-rows-whose-all-elements-are-zero
    print transitions.shape
    nonzero_row_indice, _ = transitions.nonzero()
    unique_nonzero_indice = np.unique(nonzero_row_indice)
    transitions = transitions[unique_nonzero_indice]
    print transitions.shape

    hyp_data = csr_matrix(transitions, copy=True)
    print hyp_data.shape
    hyp_structural = hyp_structural[unique_nonzero_indice]

    if row_normed:
        # norm the data (rows sum to 1)
        norm_h = hyp_data.sum(axis=1)
        n_nzeros = np.where(norm_h > 0)
        norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
        norm_h = np.array(norm_h).T[0]
        csr_scale_rows(hyp_data.shape[0], hyp_data.shape[1],
                       hyp_data.indptr, hyp_data.indices, hyp_data.data, norm_h)

    # calculate the gini coefficient for a random sample of rows
    gini = []
    c = 0
    # for i in range(0, hyp_data.shape[0]):
    import random
    for i in random.sample(range(0, hyp_data.shape[0]), 10000):
        c += 1
        if c % 1000 == 0:
            print c
        counts = hyp_data.getrow(i).toarray()[0]
        links = hyp_structural.getrow(i).toarray()[0]
        indices_of_links = links > 0
        gini_data = counts[indices_of_links]
        gini.append(gini_step(gini_data))
    print "gini"

    if row_normed:
        write_pickle('output/gini_random_rows_row_normed.obj', gini)
    else:
        write_pickle('output/gini_random_rows.obj', gini)
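gini_step is not included in the snippet above. A common implementation of the Gini coefficient that such a step might use, shown here only as an assumption (sorted-index formulation; 0 means the counts are spread evenly over the links, values near 1 mean they are concentrated on a few links):

import numpy as np

def gini(values):
    # mean-difference formulation over ascending sorted values
    values = np.sort(np.asarray(values, dtype=float))
    n = len(values)
    if n == 0 or values.sum() == 0:
        return 0.0
    index = np.arange(1, n + 1)
    return (2.0 * np.sum(index * values) / (n * values.sum())) - (n + 1.0) / n

print(gini([1., 1., 1., 1.]))   # 0.0
print(gini([0., 0., 0., 10.]))  # 0.75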
# f = open('X_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)

# X = normalize(X)

# compute the inverse of the l2 norm of non-zero elements
X.data **= 2
norm = X.sum(axis=1)
n_nzeros = np.where(norm > 0)
norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
norm = np.array(norm).T[0]
X.data = np.sqrt(X.data)

# modify the sparse matrix in place
sparsetools.csr_scale_rows(X.shape[0], X.shape[1],
                           X.indptr, X.indices, X.data, norm)

print X.shape
print X[0, :].sum()

sparse_save(X, "../data/tfidf_norm.h5")

# f = open('X_norm_data.p', 'wb')
# cPickle.dump(X.data, f, protocol=-1)
# f = open('X__norm_indices.p', 'wb')
# cPickle.dump(X.indices, f, protocol=-1)
# f = open('X_norm_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)
def evidence(self, hypothesis, structur, k=1, prior=1., norm=True):
    """
    Determines Bayesian evidence given fitted model and hypothesis

    Args:
        hypothesis: Hypothesis csr matrix, indices need to map to those of the
            transition matrix
        k: Concentration parameter k
        prior: proto Dirichlet prior
        norm: Flag for normalizing the hypothesis matrix
    Returns:
        evidence
    """
    # care with copy here
    hypothesis = csr_matrix(hypothesis, copy=True)
    structur = csr_matrix(structur, copy=True)

    pseudo_counts = k * self.state_count

    if hypothesis.size != 0:
        # in case of memory issues set copy to False, but then care about the
        # changed hypothesis matrix
        if norm == True:
            norm_h = hypothesis.sum(axis=1)
            n_nzeros = np.where(norm_h > 0)
            norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
            norm_h = np.array(norm_h).T[0]

            # modify the sparse matrix in place
            csr_scale_rows(hypothesis.shape[0], hypothesis.shape[1],
                           hypothesis.indptr, hypothesis.indices,
                           hypothesis.data, norm_h)

            # distribute pseudo counts to matrix, row-based approach
            hypothesis = hypothesis * pseudo_counts

            # also consider those rows which only include zeros
            norma = hypothesis.sum(axis=1)
            n_zeros, _ = np.where(norma == 0)
            hypothesis[n_zeros, :] = pseudo_counts / self.state_count
        else:
            norm_h = hypothesis.sum(axis=1)
            n_nzeros = np.where(norm_h > 0)
            norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
            norm_h = np.array(norm_h).T[0]

            # modify the sparse matrix in place
            csr_scale_rows(hypothesis.shape[0], hypothesis.shape[1],
                           hypothesis.indptr, hypothesis.indices,
                           hypothesis.data, norm_h)

            # distribute pseudo counts to matrix, row-based approach
            # TODO check if this line should be placed after the zero_rows_norm() call
            hypothesis = hypothesis * pseudo_counts

            # self.zero_rows_norm(hypothesis, structur, k)
            self.zero_rows_norm_eff1(hypothesis, structur, k)
    else:
        # if the hypothesis matrix is empty, we can simply increase the proto
        # prior parameter
        prior += k

    # transition matrix with additional Dirichlet prior (not memory efficient)
    transitions_prior = self.transitions.copy()
    transitions_prior = transitions_prior + hypothesis

    # calculate the evidence
    evidence = 0
    evidence += gammaln(hypothesis.sum(axis=1) + self.state_count * prior).sum()
    evidence -= gammaln(self.transitions.sum(axis=1) + hypothesis.sum(axis=1)
                        + self.state_count * prior).sum()
    evidence += gammaln(transitions_prior.data + prior).sum()
    evidence -= gammaln(hypothesis.data + prior).sum() + \
                (len(transitions_prior.data) - len(hypothesis.data)) * gammaln(prior)

    return evidence
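Written out for a single row, the gammaln bookkeeping above is the log marginal likelihood of a Dirichlet-multinomial, with Dirichlet parameters alpha_j = hypothesis_ij + prior and observed transition counts n_j. A small sketch with toy numbers, mirroring only the formula, not the class:

import numpy as np
from scipy.special import gammaln

n = np.array([5., 0., 2.])            # observed transition counts for one row
alpha = np.array([1.5, 1.0, 2.5])     # elicited pseudo counts + proto prior

# log p(n | alpha) = log [ Gamma(sum a) / Gamma(sum a + sum n)
#                          * prod_j Gamma(a_j + n_j) / Gamma(a_j) ]
log_evidence = (gammaln(alpha.sum()) - gammaln(n.sum() + alpha.sum())
                + np.sum(gammaln(n + alpha) - gammaln(alpha)))
print(log_evidence)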
def build_rank(self, objects, isBinaryWord=False):
    """Build tf-idf ranking score for terms in the corpus.

    Note:
        The code in this method could have been extracted to other smaller
        methods, improving legibility. This extraction has not been done so
        that its runtime complexity can be computed easily (the runtime
        complexity can be improved).

    Args:
        objects (list of Indexable): List of indexed objects that will be
            considered during tf-idf score computation.
    """
    self.__build_vocabulary(objects, isBinaryWord)

    n_terms = len(self.vocabulary)
    n_docs = len(objects)
    ft_matrix = sp.lil_matrix((n_docs, n_terms), dtype=np.dtype(float))

    logger.info('Vocabulary assembled with terms count %s', n_terms)

    # compute tf
    logger.info('Starting tf computation...')
    for index, indexable in enumerate(objects):
        for word in indexable.words_generator(self.stop_words):
            word_index_in_vocabulary = self.vocabulary[word]
            doc_word_count = indexable.count_for_word(word)
            ft_matrix[index, word_index_in_vocabulary] = doc_word_count

    # Add synword's tf
    # for word in self.vocabulary_withoutsynword.keys():
    #     for synword in sy.synonymwords(word)[0:4]:
    #         word_index_in_vocabulary = self.vocabulary[word]
    #         synword_index_in_vocabulary = self.vocabulary[synword]
    #         if synword not in self.vocabulary_withoutsynword.keys():
    #             # print "origin word: ", word, " synword: ", synword
    #             ft_matrix[:, synword_index_in_vocabulary] = ft_matrix[:, word_index_in_vocabulary]
    #         elif synword != word:
    #             newmatrix1 = 0.6 * ft_matrix[:, word_index_in_vocabulary] + 0.4 * ft_matrix[:, synword_index_in_vocabulary]
    #             newmatrix2 = 0.4 * ft_matrix[:, word_index_in_vocabulary] + 0.6 * ft_matrix[:, synword_index_in_vocabulary]
    #             ft_matrix[:, word_index_in_vocabulary] = newmatrix1
    #             ft_matrix[:, synword_index_in_vocabulary] = newmatrix2

    self.ft_matrix = ft_matrix.tocsc()

    logger.info('Starting tf-idf computation...')
    # compute idf with smoothing
    df = np.diff(self.ft_matrix.indptr) + self.smoothing
    n_docs_smooth = n_docs + self.smoothing

    # create diagonal matrix to be multiplied with ft
    idf = np.log(float(n_docs_smooth) / df) + 1.0
    self.ifd_diag_matrix = sp.spdiags(idf, diags=0, m=n_terms, n=n_terms)

    # compute tf-idf
    self.tf_idf_matrix = self.ft_matrix * self.ifd_diag_matrix
    self.tf_idf_matrix = self.tf_idf_matrix.tocsr()

    # compute tf-idf normalization (l2, row-wise, in place)
    norm = self.tf_idf_matrix.tocsr(copy=True)
    norm.data **= 2
    norm = norm.sum(axis=1)
    n_nzeros = np.where(norm > 0)
    norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
    norm = np.array(norm).T[0]

    sptools.csr_scale_rows(self.tf_idf_matrix.shape[0],
                           self.tf_idf_matrix.shape[1],
                           self.tf_idf_matrix.indptr,
                           self.tf_idf_matrix.indices,
                           self.tf_idf_matrix.data,
                           norm)