Example No. 1
    def build_rank(self, tweets):
        """Build tf-idf ranking score for terms in the corpus.

        Note:
          The code in this method could have been extracted to other smaller
          methods, improving legibility. This extraction has not been done so
          that its runtime complexity can be computed easily (the runtime
          complexity can be improved).

        Args:
          tweets (list of Indexable): List of indexed tweets that will be
            considered during tf-idf score computation.

        """
        self.__build_vocabulary(tweets)

        n_terms = len(self.vocabulary)
        n_docs = len(tweets)
        ft_matrix = sp.lil_matrix((n_docs, n_terms), dtype=np.dtype(float))

        logging.info("[Ranker] Vocabulary assembled with terms count %s, docs count %s" \
            % ("{:,}".format(n_terms), "{:,}".format(n_docs)))

        # compute tf
        logging.info("[Ranker] Starting tf computation ...")
        for index, indexable in enumerate(tweets):
            for word in indexable.words_generator(self.stop_words):
                word_index_in_vocabulary = self.vocabulary[word]
                doc_word_count = indexable.count_for_word(word)
                ft_matrix[index, word_index_in_vocabulary] = doc_word_count
        # store the term-frequency matrix in compressed sparse column (CSC) format
        self.ft_matrix = ft_matrix.tocsc()

        # compute idf with smoothing
        logging.info("[Ranker] Starting tf-idf computation ...")
        df = np.diff(self.ft_matrix.indptr) + self.smoothing
        n_docs_smooth = n_docs + self.smoothing

        # create diagonal matrix to be multiplied with ft
        idf = np.log(float(n_docs_smooth) / df) + 1.0
        self.ifd_diag_matrix = sp.spdiags(idf, diags=0, m=n_terms, n=n_terms)

        # compute tf-idf
        self.tf_idf_matrix = self.ft_matrix * self.ifd_diag_matrix
        self.tf_idf_matrix = self.tf_idf_matrix.tocsr()

        # compute tf-idf normalization
        norm = self.tf_idf_matrix.tocsr(copy=True)
        norm.data **= 2
        norm = norm.sum(axis=1)
        n_nzeros = np.where(norm > 0)
        norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
        norm = np.array(norm).T[0]
        sptools.csr_scale_rows(self.tf_idf_matrix.shape[0],
                               self.tf_idf_matrix.shape[1],
                               self.tf_idf_matrix.indptr,
                               self.tf_idf_matrix.indices,
                               self.tf_idf_matrix.data, norm)
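
For reference, the last block above is an L2 row normalization done through SciPy's private sparsetools kernel. Below is a minimal, self-contained sketch of the same step using only public scipy.sparse API; the helper name and the toy matrix are illustrative and not part of the example above.

import numpy as np
import scipy.sparse as sp

def l2_normalize_rows(X):
    """Return a copy of X (CSR) with every non-empty row scaled to unit L2 norm."""
    X = X.tocsr(copy=True)
    sq = X.copy()
    sq.data **= 2                                   # square the stored values
    row_norms = np.sqrt(np.asarray(sq.sum(axis=1)).ravel())
    scale = np.zeros_like(row_norms)
    nz = row_norms > 0                              # leave all-zero rows untouched
    scale[nz] = 1.0 / row_norms[nz]
    return sp.diags(scale) * X                      # left-multiplying by diag(scale) scales the rows

X = sp.csr_matrix(np.array([[3.0, 4.0, 0.0],
                            [0.0, 0.0, 0.0],
                            [1.0, 1.0, 1.0]]))
print(l2_normalize_rows(X).toarray())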
Example No. 3
def distr_chips_row(matrix,
                    chips,
                    n_jobs=-1,
                    norm=True,
                    dist_zero_rows=True,
                    mode="integers"):
    '''
    Trial roulette method for eliciting Dirichlet priors from expressed hypothesis matrix.
    This function works row-based. Thus, each row will receive the given number of chips!!!
    :param matrix: csr_matrix A_k expressing theory H_k
    :param chips: number of (single row) chips C to distribute
    :param n_jobs: number of jobs, default -1
    :param norm: set False if matrix does not need to be normalized (row-based)
    :param dist_zero_rows: if set to False, the method does not distribute chips to rows with only zeros (use with caution)
    :param mode: sets the mode of the distribution; "integers" means that the distributed pseudo clicks are integers;
                 "reals" means that the pseudo clicks (hyperparameters) can also be positive reals
    :return: Dirichlet hyperparameters in the shape of a matrix
    '''

    if mode not in ['integers', 'reals']:
        raise Exception("Mode needs to be 'integers' or 'reals'!")

    chips = float(chips)

    if not chips.is_integer() and mode == "integers":
        raise Exception("If mode is 'integers' then only use integer chip counts!")

    if norm == True:
        norma = matrix.sum(axis=1)
        n_nzeros = np.where(norma > 0)
        n_zeros, _ = np.where(norma == 0)
        norma[n_nzeros] = 1.0 / norma[n_nzeros]
        norma = norma.T[0]
        csr_scale_rows(matrix.shape[0], matrix.shape[1], matrix.indptr,
                       matrix.indices, matrix.data, norma)

    if mode == "integers":
        r = Parallel(n_jobs=n_jobs)(delayed(distr_chips)(
            matrix[i, :], chips, dist_zero_matrix=dist_zero_rows, norm=False)
                                    for i in xrange(matrix.shape[0]))
        return scipy.sparse.vstack(r)

    if mode == "reals":
        matrix = matrix * chips

        if dist_zero_rows == True:
            # if some rows have 100% sparsity, we equally distribute the chips
            n, m = matrix.shape
            if norm == False:
                norma = matrix.sum(axis=1)
                n_zeros, _ = np.where(norma == 0)
            if len(n_zeros) > 0:
                #with numpy 1.10 dev, the next line is not needed
                if int(np.version.short_version.split(".")[1]) < 10:
                    n_zeros = np.array(n_zeros)[0]
                matrix[n_zeros, :] = chips / m

        return matrix
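
For orientation, here is a minimal sketch of what the "reals" branch above computes, written against public scipy.sparse API only: row-normalize the hypothesis matrix, give each row the requested number of chips, and spread the chips evenly over rows that are entirely zero. The function name and the toy hypothesis matrix are illustrative.

import numpy as np
import scipy.sparse as sp

def distr_chips_reals_sketch(A, chips):
    """Row-normalize A, give each row `chips` pseudo counts, spread chips evenly on empty rows."""
    A = sp.csr_matrix(A, dtype=float, copy=True)
    row_sums = np.asarray(A.sum(axis=1)).ravel()
    scale = np.zeros_like(row_sums)
    nonzero = row_sums > 0
    scale[nonzero] = 1.0 / row_sums[nonzero]
    A = sp.diags(scale) * A * chips              # every non-empty row now sums to `chips`
    zero_rows = np.where(row_sums == 0)[0]
    if len(zero_rows) > 0:                       # rows with 100% sparsity get a uniform share
        A = sp.lil_matrix(A)
        A[zero_rows, :] = chips / float(A.shape[1])
        A = A.tocsr()
    return A

H = sp.csr_matrix(np.array([[2.0, 2.0, 0.0],
                            [0.0, 0.0, 0.0],
                            [1.0, 0.0, 3.0]]))
print(distr_chips_reals_sketch(H, 10).toarray())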
Example No. 4
def norm(hypothesis):
    hypothesis = hypothesis.copy()
    norma = hypothesis.sum(axis=1)
    n_nzeros = np.where(norma > 0)
    n_zeros,_ = np.where(norma == 0)
    norma[n_nzeros] = 1.0 / norma[n_nzeros]
    norma = norma.T[0]
    csr_scale_rows(hypothesis.shape[0], hypothesis.shape[1], hypothesis.indptr,
                   hypothesis.indices, hypothesis.data, norma)
    return hypothesis
Example No. 5
def normalize(pairs):
    """
    Normalize rows of a csr matrix (sum to 1)
    """
    factor = pairs.sum(axis=1)
    nnzeros = np.where(factor > 0)
    factor[nnzeros] = 1 / factor[nnzeros]
    factor = np.array(factor)[0]
    if not pairs.format == "csr":
        raise ValueError("csr only")
    sparsetools.csr_scale_rows(pairs.shape[0], pairs.shape[1], pairs.indptr,
                               pairs.indices, pairs.data, factor)
    return pairs
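
A short usage sketch for the helper above; it assumes the module's own imports (numpy and an older SciPy where scipy.sparse.sparsetools is importable), and the toy matrix is illustrative.

import numpy as np
from scipy.sparse import csr_matrix

pairs = csr_matrix(np.array([[2.0, 2.0, 0.0],
                             [0.0, 0.0, 0.0],    # an all-zero row is left untouched
                             [1.0, 0.0, 3.0]]))
normalize(pairs)                                  # scales pairs in place (and returns it)
print(pairs.toarray())                            # non-empty rows now sum to 1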
Example No. 7
def distr_chips_row(matrix, chips, n_jobs=-1, norm=True, dist_zero_rows=True, mode="integers"):
    '''
    Trial roulette method for eliciting Dirichlet priors from expressed hypothesis matrix.
    This function works row-based. Thus, each row will receive the given number of chips!!!
    :param matrix: csr_matrix A_k expressing theory H_k
    :param chips: number of (single row) chips C to distribute
    :param n_jobs: number of jobs, default -1
    :param norm: set False if matrix does not need to be normalized (row-based)
    :param dist_zero_rows: if set to False, the method does not distribute chips to rows with only zeros (use with caution)
    :param mode: sets the mode of the distribution; "integers" means that the distributed pseudo clicks are integers;
                 "reals" means that the pseudo clicks (hyperparameters) can also be positive reals
    :return: Dirichlet hyperparameters in the shape of a matrix
    '''

    if mode not in ['integers', 'reals']:
        raise Exception("Mode needs to be 'integers' or 'reals'!")

    chips = float(chips)

    if not chips.is_integer() and mode == "integers":
        raise Exception("If mode is 'integers' then only use integer chip counts!")

    if norm == True:
        norma = matrix.sum(axis=1)
        n_nzeros = np.where(norma > 0)
        n_zeros,_ = np.where(norma == 0)
        norma[n_nzeros] = 1.0 / norma[n_nzeros]
        norma = norma.T[0]
        csr_scale_rows(matrix.shape[0], matrix.shape[1], matrix.indptr, matrix.indices,
                       matrix.data, norma)

    if mode == "integers":
        r = Parallel(n_jobs=n_jobs)(
            delayed(distr_chips)(matrix[i, :], chips, dist_zero_matrix=dist_zero_rows, norm=False)
            for i in xrange(matrix.shape[0]))
        return scipy.sparse.vstack(r)

    if mode == "reals":
        matrix = matrix * chips

        if dist_zero_rows == True:
            # if some rows have 100% sparsity, we equally distribute the chips
            n,m = matrix.shape
            if norm == False:
                norma = matrix.sum(axis=1)
                n_zeros,_ = np.where(norma == 0)
            if len(n_zeros) > 0:
                #with numpy 1.10 dev, the next line is not needed
                if int(np.version.short_version.split(".")[1]) < 10:
                    n_zeros = np.array(n_zeros)[0]
                matrix[n_zeros,:] = chips / m

        return matrix
Example No. 8
def l2_norm(sparse_csc_matrix):
    # convert csc_matrix to csr_matrix which is done in linear time
    norm = sparse_csc_matrix.tocsr(copy=True)

    # compute the inverse of l2 norm of non-zero elements
    norm.data **= 2
    norm = norm.sum(axis=1)
    n_nzeros = np.where(norm > 0)
    norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
    norm = np.array(norm).T[0]

    # modify sparse_csc_matrix in place
    csr_scale_rows(sparse_csc_matrix.shape[0], sparse_csc_matrix.shape[1],
                   sparse_csc_matrix.indptr, sparse_csc_matrix.indices,
                   sparse_csc_matrix.data, norm)
Example No. 9
    def test_scale_rows_and_cols(self):
        D = matrix([[1,0,0,2,3],
                    [0,4,0,5,0],
                    [0,0,6,7,0]])


        #TODO expose through function
        S = csr_matrix(D)
        v = array([1,2,3])
        csr_scale_rows(3,5,S.indptr,S.indices,S.data,v)
        assert_equal(S.todense(), diag(v)*D )

        S = csr_matrix(D)
        v = array([1,2,3,4,5])
        csr_scale_columns(3,5,S.indptr,S.indices,S.data,v)
        assert_equal(S.todense(), D*diag(v) )

        # blocks
        E = kron(D,[[1,2],[3,4]])
        S = bsr_matrix(E,blocksize=(2,2))
        v = array([1,2,3,4,5,6])
        bsr_scale_rows(3,5,2,2,S.indptr,S.indices,S.data,v)
        assert_equal(S.todense(), diag(v)*E )

        S = bsr_matrix(E,blocksize=(2,2))
        v = array([1,2,3,4,5,6,7,8,9,10])
        bsr_scale_columns(3,5,2,2,S.indptr,S.indices,S.data,v)
        assert_equal(S.todense(), E*diag(v) )

        E = kron(D,[[1,2,3],[4,5,6]])
        S = bsr_matrix(E,blocksize=(2,3))
        v = array([1,2,3,4,5,6])
        bsr_scale_rows(3,5,2,3,S.indptr,S.indices,S.data,v)
        assert_equal(S.todense(), diag(v)*E )

        S = bsr_matrix(E,blocksize=(2,3))
        v = array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15])
        bsr_scale_columns(3,5,2,3,S.indptr,S.indices,S.data,v)
        assert_equal(S.todense(), E*diag(v) )
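
The assertions above rest on the identity that scaling the rows of a matrix by v equals diag(v) * D, and scaling the columns equals D * diag(v). A quick dense check with the test's first matrix and vector, using NumPy only:

import numpy as np

D = np.array([[1, 0, 0, 2, 3],
              [0, 4, 0, 5, 0],
              [0, 0, 6, 7, 0]])
v = np.array([1, 2, 3])
print(np.diag(v).dot(D))   # row i scaled by v[i]:
                           # [[ 1  0  0  2  3]
                           #  [ 0  8  0 10  0]
                           #  [ 0  0 18 21  0]]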
Example No. 10
    def build_rank(self, objects):

        self.__build_vocabulary(objects)

        n_terms = len(self.vocabulary)
        n_docs = len(objects)
        ft_matrix = sp.lil_matrix((n_docs, n_terms), dtype=np.dtype(float))

        for index, indexable in enumerate(objects):
            for word in indexable.words_generator(self.stop_words):
                word_index_in_vocabulary = self.vocabulary[word]
                doc_word_count = indexable.count_for_word(word)
                ft_matrix[index, word_index_in_vocabulary] = doc_word_count
        self.ft_matrix = ft_matrix.tocsc()

        logger.info(
            'Results will be displayed from higher to lower ranking...')

        df = np.diff(self.ft_matrix.indptr) + self.smoothing
        n_docs_smooth = n_docs + self.smoothing

        idf = np.log(float(n_docs_smooth) / df) + 1.0
        self.ifd_diag_matrix = sp.spdiags(idf, diags=0, m=n_terms, n=n_terms)

        self.tf_idf_matrix = self.ft_matrix * self.ifd_diag_matrix
        self.tf_idf_matrix = self.tf_idf_matrix.tocsr()

        norm = self.tf_idf_matrix.tocsr(copy=True)
        norm.data **= 2
        norm = norm.sum(axis=1)
        n_nzeros = np.where(norm > 0)
        norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
        norm = np.array(norm).T[0]
        sptools.csr_scale_rows(self.tf_idf_matrix.shape[0],
                               self.tf_idf_matrix.shape[1],
                               self.tf_idf_matrix.indptr,
                               self.tf_idf_matrix.indices,
                               self.tf_idf_matrix.data, norm)
Example No. 11
def csr_scale_rows(A, x):
    sparsetools.csr_scale_rows(A.shape[0], A.shape[1], A.indptr, A.indices,
                               A.data, x)
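
A brief usage sketch of this wrapper; it assumes an older SciPy where scipy.sparse.sparsetools is importable, and the matrix and scale vector are illustrative.

import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[1.0, 0.0, 2.0],
                         [0.0, 3.0, 0.0]]))
x = np.array([10.0, 0.5])      # per-row scale factors
csr_scale_rows(A, x)           # modifies A.data in place
print(A.toarray())             # [[10., 0., 20.], [0., 1.5, 0.]]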
Example No. 12
def normalized_entropy():
    transition_matrix = pickle.load( open( SSD_HOME+"pickle/transition_matrix", "rb" ) )
    print "loaded transitions"
    graph = pickle.load( open( SSD_HOME+"pickle/graph", "rb" ) )
    print "loaded graph"
    values = pickle.load( open( SSD_HOME+"pickle/values", "rb" ) )


    vocab = pickle.load( open( SSD_HOME+"pickle/vocab", "rb" ) )
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)



    # structural hypotheses
    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape, dtype=np.float)



    transitions = csr_matrix((transition_matrix[2], (transition_matrix[0], transition_matrix[1])),
                             shape=shape)

    del transition_matrix

    # delete all-zero rows; see http://stackoverflow.com/questions/31188141/scipy-sparse-matrix-remove-the-rows-whose-all-elements-are-zero
    print transitions.shape
    nonzero_row_indice, _ = transitions.nonzero()
    unique_nonzero_indice = np.unique(nonzero_row_indice)
    transitions = transitions[unique_nonzero_indice]
    print transitions.shape

    hyp_data = csr_matrix(transitions, copy=True)
    print hyp_data.shape

    hyp_structural = hyp_structural[unique_nonzero_indice]
    #norm the data
    norm_h = hyp_data.sum(axis=1)
    n_nzeros = np.where(norm_h > 0)
    norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
    norm_h = np.array(norm_h).T[0]

    csr_scale_rows(hyp_data.shape[0],
                   hyp_data.shape[1],
                   hyp_data.indptr,
                   hyp_data.indices,
                   hyp_data.data, norm_h)

    #calculate the entropy for a row
    #entropy = np.apply_along_axis( entropy_step, axis=1, arr=hyp_data )
    entropy =[]
    c=0
    for i in range(0,hyp_data.shape[0]):
        c+=1
        if c % 100000 == 0:
            print c
        x = hyp_data.getrow(i)
        entropy.append(entropy_step(x))

    print "entropy"
    # number of links per row, needed to normalize the entropy
    norm_h = hyp_structural.sum(axis=1)
    #print norm_h
    normalized_entropy = []
    for i, x in enumerate(entropy):
        if i % 100000 == 0:
            print i

        if math.log(norm_h[i][0])==0:
            normalized_entropy.append(0)
        else:
            e = x/math.log(norm_h[i][0])
            normalized_entropy.append(e)

    print "normed entropy"
    write_pickle('output/normalized_entropy.obj',normalized_entropy)
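
entropy_step is not included in this listing; a minimal sketch of what the per-row step plausibly computes is given below: the Shannon entropy of one row of the row-stochastic matrix, which the loop afterwards divides by the log of the row's out-degree. The helper name and the toy row are illustrative assumptions.

import math
import numpy as np
from scipy.sparse import csr_matrix

def entropy_step_sketch(sparse_row):
    """Shannon entropy of one (already row-normalized) sparse row; 0 * log(0) is treated as 0."""
    p = sparse_row.toarray().ravel()
    p = p[p > 0]
    return float(-(p * np.log(p)).sum())

row = csr_matrix(np.array([[0.5, 0.25, 0.25, 0.0]]))
h = entropy_step_sketch(row)
print(h / math.log(3))          # normalized by log(#out-links), here 3 links -> ~0.95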
Example No. 13
def gini_random_rows(row_normed):
    print "gini"
    print row_normed
    transition_matrix = pickle.load( open( SSD_HOME+"pickle/transition_matrix", "rb" ) )
    print "loaded transitions"
    graph = pickle.load( open( SSD_HOME+"pickle/graph", "rb" ) )
    print "loaded graph"
    values = pickle.load( open( SSD_HOME+"pickle/values", "rb" ) )


    vocab = pickle.load( open( SSD_HOME+"pickle/vocab", "rb" ) )
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)



    # structural hypotheses
    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape, dtype=np.float)



    transitions = csr_matrix((transition_matrix[2], (transition_matrix[0], transition_matrix[1])),
                             shape=shape)

    del transition_matrix

    # delete all-zero rows; see http://stackoverflow.com/questions/31188141/scipy-sparse-matrix-remove-the-rows-whose-all-elements-are-zero
    print transitions.shape
    nonzero_row_indice, _ = transitions.nonzero()
    unique_nonzero_indice = np.unique(nonzero_row_indice)
    transitions = transitions[unique_nonzero_indice]
    print transitions.shape

    hyp_data = csr_matrix(transitions, copy=True)
    print hyp_data.shape

    hyp_structural = hyp_structural[unique_nonzero_indice]
    if row_normed:
        #norm the data
        norm_h = hyp_data.sum(axis=1)
        n_nzeros = np.where(norm_h > 0)
        norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
        norm_h = np.array(norm_h).T[0]

        csr_scale_rows(hyp_data.shape[0],
                       hyp_data.shape[1],
                       hyp_data.indptr,
                       hyp_data.indices,
                       hyp_data.data, norm_h)

    #calculate the gini for a row

    gini =[]
    c=0
    #for i in range(0,hyp_data.shape[0]):

    import random
    for i in random.sample(range(0,hyp_data.shape[0]), 10000):
        c+=1
        if c % 1000 == 0:
            print c
        counts = hyp_data.getrow(i).toarray()[0]
        links = hyp_structural.getrow(i).toarray()[0]
        indices_of_links =  links > 0
        gini_data = counts[indices_of_links]
        gini.append(gini_step(gini_data))

    print "gini"
    if row_normed:
        write_pickle('output/gini_random_rows_row_normed.obj',gini)
    else:
        write_pickle('output/gini_random_rows.obj',gini)
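
gini_step is likewise not part of this listing; below is a hypothetical, self-contained version computing the standard Gini coefficient of a non-negative count vector, shown only to make the example readable end to end.

import numpy as np

def gini_step_sketch(values):
    """Gini coefficient of a non-negative vector (0 = perfectly even, (n-1)/n = fully concentrated)."""
    values = np.sort(np.asarray(values, dtype=float))
    n = len(values)
    if n == 0 or values.sum() == 0:
        return 0.0
    cum = np.cumsum(values)
    return (n + 1 - 2.0 * cum.sum() / cum[-1]) / n

print(gini_step_sketch([1, 1, 1, 1]))   # 0.0  -> clicks spread evenly over the row's links
print(gini_step_sketch([0, 0, 0, 8]))   # 0.75 -> clicks concentrated on a single link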
Example No. 14
# f = open('X_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)

#X = normalize(X)

# compute the inverse of l2 norm of non-zero elements
X.data **= 2
norm = X.sum(axis=1)
n_nzeros = np.where(norm > 0)
norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
norm = np.array(norm).T[0]
X.data = np.sqrt(X.data)  # undo the earlier squaring of the stored values
# modify X (a CSR matrix) in place
sparsetools.csr_scale_rows(X.shape[0],
                           X.shape[1],
                           X.indptr,
                           X.indices,
                           X.data, norm)

print X.shape
print X[0,:].sum()

sparse_save(X,"../data/tfidf_norm.h5")

# f = open('X_norm_data.p', 'wb')
# cPickle.dump(X.data, f, protocol=-1)
# f = open('X__norm_indices.p', 'wb')
# cPickle.dump(X.indices, f, protocol=-1)
# f = open('X_norm_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)
Example No. 15
def csr_scale_rows(A, x):
    sparsetools.csr_scale_rows(A.shape[0], A.shape[1],
                               A.indptr, A.indices, A.data,
                               x)
Example No. 16
    def evidence(self, hypothesis, structur, k=1, prior=1., norm=True):
        """
        Determines Bayesian evidence given fitted model and hypothesis

        Args:
            hypothesis: Hypothesis csr matrix,
                        indices need to map those of transition matrix
            k: Concentration parameter k
            prior: proto Dirichlet prior
            norm: Flag for normalizing hypothesis matrix
        Returns
            evidence
        """

        # care with copy here
        hypothesis = csr_matrix(hypothesis, copy=True)

        structur = csr_matrix(structur, copy=True)

        pseudo_counts = k * self.state_count

        if hypothesis.size != 0:
            # in case of memory issues, set copy to False, but be aware that the hypothesis matrix is then modified in place
            if norm == True:
                #print "in norm"
                norm_h = hypothesis.sum(axis=1)
                n_nzeros = np.where(norm_h > 0)
                norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
                norm_h = np.array(norm_h).T[0]
                #print "in place mod"
                # modify sparse_csc_matrix in place
                csr_scale_rows(hypothesis.shape[0], hypothesis.shape[1],
                               hypothesis.indptr, hypothesis.indices,
                               hypothesis.data, norm_h)

                # distribute pseudo counts to matrix, row-based approach
                hypothesis = hypothesis * pseudo_counts
                #print "after pseude counts"
                # also consider those rows which only include zeros
                norma = hypothesis.sum(axis=1)
                n_zeros, _ = np.where(norma == 0)
                hypothesis[n_zeros, :] = pseudo_counts / self.state_count
            else:
                #print "in norm"
                norm_h = hypothesis.sum(axis=1)
                n_nzeros = np.where(norm_h > 0)
                norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
                norm_h = np.array(norm_h).T[0]
                #print "in place mod"
                # modify sparse_csc_matrix in place
                csr_scale_rows(hypothesis.shape[0], hypothesis.shape[1],
                               hypothesis.indptr, hypothesis.indices,
                               hypothesis.data, norm_h)

                # distribute pseudo counts to matrix, row-based approach
                #TODO check if this line should be placed after the zero_rows_norm() call????
                hypothesis = hypothesis * pseudo_counts

                #self.zero_rows_norm(hypothesis, structur,k)
                self.zero_rows_norm_eff1(hypothesis, structur, k)

        else:
            # if hypothesis matrix is empty, we can simply increase the proto prior parameter
            prior += k

        # transition matrix with additional Dirichlet prior
        # not memory efficient
        transitions_prior = self.transitions.copy()
        transitions_prior = transitions_prior + hypothesis
        #print "after copy"
        # elegantly calculate evidence
        evidence = 0
        evidence += gammaln(hypothesis.sum(axis=1) +
                            self.state_count * prior).sum()
        evidence -= gammaln(
            self.transitions.sum(axis=1) + hypothesis.sum(axis=1) +
            self.state_count * prior).sum()
        evidence += gammaln(transitions_prior.data + prior).sum()
        evidence -= gammaln(hypothesis.data + prior).sum() + (len(
            transitions_prior.data) - len(hypothesis.data)) * gammaln(prior)
        return evidence
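
For orientation, the vectorized gammaln expressions at the end compute, per row, the log marginal likelihood of a Dirichlet-multinomial model. A dense single-row sketch of that quantity follows; the toy counts and pseudo counts are illustrative.

import numpy as np
from scipy.special import gammaln

def log_evidence_row(counts, alpha):
    """log P(counts | alpha) for one row under a Dirichlet-multinomial model."""
    return (gammaln(alpha.sum()) - gammaln(counts.sum() + alpha.sum())
            + np.sum(gammaln(counts + alpha) - gammaln(alpha)))

counts = np.array([5.0, 0.0, 2.0])   # observed transitions out of one state
alpha = np.array([2.0, 1.0, 1.0])    # Dirichlet pseudo counts (hypothesis chips + proto prior)
print(log_evidence_row(counts, alpha))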
Example No. 17
def normalized_entropy():
    transition_matrix = pickle.load(
        open(SSD_HOME + "pickle/transition_matrix", "rb"))
    print "loaded transitions"
    graph = pickle.load(open(SSD_HOME + "pickle/graph", "rb"))
    print "loaded graph"
    values = pickle.load(open(SSD_HOME + "pickle/values", "rb"))

    vocab = pickle.load(open(SSD_HOME + "pickle/vocab", "rb"))
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)

    # structural hypotheses
    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape,
                                dtype=np.float)

    transitions = csr_matrix(
        (transition_matrix[2], (transition_matrix[0], transition_matrix[1])),
        shape=shape)

    del transition_matrix

    # delete all-zero rows; see http://stackoverflow.com/questions/31188141/scipy-sparse-matrix-remove-the-rows-whose-all-elements-are-zero
    print transitions.shape
    nonzero_row_indice, _ = transitions.nonzero()
    unique_nonzero_indice = np.unique(nonzero_row_indice)
    transitions = transitions[unique_nonzero_indice]
    print transitions.shape

    hyp_data = csr_matrix(transitions, copy=True)
    print hyp_data.shape

    hyp_structural = hyp_structural[unique_nonzero_indice]
    #norm the data
    norm_h = hyp_data.sum(axis=1)
    n_nzeros = np.where(norm_h > 0)
    norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
    norm_h = np.array(norm_h).T[0]

    csr_scale_rows(hyp_data.shape[0], hyp_data.shape[1], hyp_data.indptr,
                   hyp_data.indices, hyp_data.data, norm_h)

    #calculate the entropy for a row
    #entropy = np.apply_along_axis( entropy_step, axis=1, arr=hyp_data )
    entropy = []
    c = 0
    for i in range(0, hyp_data.shape[0]):
        c += 1
        if c % 100000 == 0:
            print c
        x = hyp_data.getrow(i)
        entropy.append(entropy_step(x))

    print "entropy"
    # number of links per row, needed to normalize the entropy
    norm_h = hyp_structural.sum(axis=1)
    #print norm_h
    normalized_entropy = []
    for i, x in enumerate(entropy):
        if i % 100000 == 0:
            print i

        if math.log(norm_h[i][0]) == 0:
            normalized_entropy.append(0)
        else:
            e = x / math.log(norm_h[i][0])
            normalized_entropy.append(e)

    print "normed entropy"
    write_pickle('output/normalized_entropy.obj', normalized_entropy)
Example No. 18
def gini_random_rows(row_normed):
    print "gini"
    print row_normed
    transition_matrix = pickle.load(
        open(SSD_HOME + "pickle/transition_matrix", "rb"))
    print "loaded transitions"
    graph = pickle.load(open(SSD_HOME + "pickle/graph", "rb"))
    print "loaded graph"
    values = pickle.load(open(SSD_HOME + "pickle/values", "rb"))

    vocab = pickle.load(open(SSD_HOME + "pickle/vocab", "rb"))
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)

    # structural hypotheses
    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape,
                                dtype=np.float)

    transitions = csr_matrix(
        (transition_matrix[2], (transition_matrix[0], transition_matrix[1])),
        shape=shape)

    del transition_matrix

    # delete all-zero rows; see http://stackoverflow.com/questions/31188141/scipy-sparse-matrix-remove-the-rows-whose-all-elements-are-zero
    print transitions.shape
    nonzero_row_indice, _ = transitions.nonzero()
    unique_nonzero_indice = np.unique(nonzero_row_indice)
    transitions = transitions[unique_nonzero_indice]
    print transitions.shape

    hyp_data = csr_matrix(transitions, copy=True)
    print hyp_data.shape

    hyp_structural = hyp_structural[unique_nonzero_indice]
    if row_normed:
        #norm the data
        norm_h = hyp_data.sum(axis=1)
        n_nzeros = np.where(norm_h > 0)
        norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
        norm_h = np.array(norm_h).T[0]

        csr_scale_rows(hyp_data.shape[0], hyp_data.shape[1], hyp_data.indptr,
                       hyp_data.indices, hyp_data.data, norm_h)

    #calculate the gini for a row

    gini = []
    c = 0
    #for i in range(0,hyp_data.shape[0]):

    import random
    for i in random.sample(range(0, hyp_data.shape[0]), 10000):
        c += 1
        if c % 1000 == 0:
            print c
        counts = hyp_data.getrow(i).toarray()[0]
        links = hyp_structural.getrow(i).toarray()[0]
        indices_of_links = links > 0
        gini_data = counts[indices_of_links]
        gini.append(gini_step(gini_data))

    print "gini"
    if row_normed:
        write_pickle('output/gini_random_rows_row_normed.obj', gini)
    else:
        write_pickle('output/gini_random_rows.obj', gini)
Example No. 19
    def evidence(self, hypothesis, structur, k=1, prior=1., norm=True):
        """
        Determines Bayesian evidence given fitted model and hypothesis

        Args:
            hypothesis: Hypothesis csr matrix,
                        indices need to map those of transition matrix
            k: Concentration parameter k
            prior: proto Dirichlet prior
            norm: Flag for normalizing hypothesis matrix
        Returns
            evidence
        """

        # care with copy here
        hypothesis = csr_matrix(hypothesis, copy=True)

        structur = csr_matrix(structur, copy=True)

        pseudo_counts = k * self.state_count

        if hypothesis.size != 0:
            # in case of memory issues, set copy to False, but be aware that the hypothesis matrix is then modified in place
            if norm == True:
                #print "in norm"
                norm_h = hypothesis.sum(axis=1)
                n_nzeros = np.where(norm_h > 0)
                norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
                norm_h = np.array(norm_h).T[0]
                #print "in place mod"
                # modify sparse_csc_matrix in place
                csr_scale_rows(hypothesis.shape[0],
                               hypothesis.shape[1],
                               hypothesis.indptr,
                               hypothesis.indices,
                               hypothesis.data, norm_h)


                # distribute pseudo counts to matrix, row-based approach
                hypothesis = hypothesis * pseudo_counts
                #print "after pseude counts"
                # also consider those rows which only include zeros
                norma = hypothesis.sum(axis=1)
                n_zeros,_ = np.where(norma == 0)
                hypothesis[n_zeros,:] = pseudo_counts / self.state_count
            else:
                #print "in norm"
                norm_h = hypothesis.sum(axis=1)
                n_nzeros = np.where(norm_h > 0)
                norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
                norm_h = np.array(norm_h).T[0]
                #print "in place mod"
                # modify sparse_csc_matrix in place
                csr_scale_rows(hypothesis.shape[0],
                               hypothesis.shape[1],
                               hypothesis.indptr,
                               hypothesis.indices,
                               hypothesis.data, norm_h)


                # distribute pseudo counts to matrix, row-based approach
                #TODO check if this line should be placed after the zero_rows_norm() call????
                hypothesis = hypothesis * pseudo_counts

                #self.zero_rows_norm(hypothesis, structur,k)
                self.zero_rows_norm_eff1(hypothesis, structur, k)

        else:
            # if hypothesis matrix is empty, we can simply increase the proto prior parameter
            prior += k

        # transition matrix with additional Dirichlet prior
        # not memory efficient
        transitions_prior = self.transitions.copy()
        transitions_prior = transitions_prior + hypothesis
        #print "after copy"
        # elegantly calculate evidence
        evidence = 0
        evidence += gammaln(hypothesis.sum(axis=1)+self.state_count*prior).sum()
        evidence -= gammaln(self.transitions.sum(axis=1)+hypothesis.sum(axis=1)+self.state_count*prior).sum()
        evidence += gammaln(transitions_prior.data+prior).sum()
        evidence -= gammaln(hypothesis.data+prior).sum() + (len(transitions_prior.data)-len(hypothesis.data)) * gammaln(prior)
        return evidence
Example No. 20
    def build_rank(self, objects, isBinaryWord=False):
        """Build tf-idf ranking score for terms in the corpus.

        Note:
          The code in this method could have been extracted to other smaller
          methods, improving legibility. This extraction has not been done so
          that its runtime complexity can be computed easily (the runtime
          complexity can be improved).

        Args:
          objects (list of Indexable): List of indexed objects that will be
            considered during tf-idf score computation.

        """
        self.__build_vocabulary(objects,isBinaryWord)

        n_terms = len(self.vocabulary)
        n_docs = len(objects)
        ft_matrix = sp.lil_matrix((n_docs, n_terms), dtype=np.dtype(float))

        logger.info('Vocabulary assembled with terms count %s', n_terms)

        # compute tf
        logger.info('Starting tf computation...')
        for index, indexable in enumerate(objects):
            for word in indexable.words_generator(self.stop_words):
                word_index_in_vocabulary = self.vocabulary[word]
                doc_word_count = indexable.count_for_word(word)
                ft_matrix[index, word_index_in_vocabulary] = doc_word_count
        
        # Add synword's idf

        # for word in self.vocabulary_withoutsynword.keys():
        #     for synword in sy.synonymwords(word)[0:4]:
        #         word_index_in_vocabulary = self.vocabulary[word]
        #         synword_index_in_vocabulary = self.vocabulary[synword]
        #         if synword not in self.vocabulary_withoutsynword.keys():
        #             #print "origin word: ", word," synword: ",synword
        #             ft_matrix[:,synword_index_in_vocabulary] = ft_matrix[:,word_index_in_vocabulary]
        #         elif synword != word:
        #             newmatrix1 = 0.6*ft_matrix[:,word_index_in_vocabulary]+0.4*ft_matrix[:,synword_index_in_vocabulary]
        #             newmatrix2 = 0.4*ft_matrix[:,word_index_in_vocabulary]+0.6*ft_matrix[:,synword_index_in_vocabulary]
        #             ft_matrix[:,word_index_in_vocabulary] = newmatrix1
        #             ft_matrix[:,synword_index_in_vocabulary] = newmatrix2



        self.ft_matrix = ft_matrix.tocsc()

        logger.info('Starting tf-idf computation...')
        # compute idf with smoothing
        df = np.diff(self.ft_matrix.indptr) + self.smoothing
        n_docs_smooth = n_docs + self.smoothing

        # create diagonal matrix to be multiplied with ft
        idf = np.log(float(n_docs_smooth) / df) + 1.0
        self.ifd_diag_matrix = sp.spdiags(idf, diags=0, m=n_terms, n=n_terms)

        # compute tf-idf
        self.tf_idf_matrix = self.ft_matrix * self.ifd_diag_matrix
        self.tf_idf_matrix = self.tf_idf_matrix.tocsr()

        # compute tf-idf normalization
        norm = self.tf_idf_matrix.tocsr(copy=True)
        norm.data **= 2
        norm = norm.sum(axis=1)
        n_nzeros = np.where(norm > 0)
        norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
        norm = np.array(norm).T[0]
        sptools.csr_scale_rows(self.tf_idf_matrix.shape[0],
                               self.tf_idf_matrix.shape[1],
                               self.tf_idf_matrix.indptr,
                               self.tf_idf_matrix.indices,
                               self.tf_idf_matrix.data, norm)
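
The commented-out block above blends a term's frequency column with a synonym's column using 0.6/0.4 weights. A tiny standalone illustration of that weighting on a dense array follows; the column roles and weights come from the comments, everything else is illustrative.

import numpy as np

ft = np.array([[2.0, 0.0],
               [1.0, 3.0]])                    # columns: original word, synonym word
word_col = ft[:, 0].copy()
syn_col = ft[:, 1].copy()
ft[:, 0] = 0.6 * word_col + 0.4 * syn_col      # newmatrix1 in the commented-out code
ft[:, 1] = 0.4 * word_col + 0.6 * syn_col      # newmatrix2 in the commented-out code
print(ft)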
Example No. 21
# f = open('X_indices.p', 'wb')
# cPickle.dump(X.indices, f, protocol=-1)
# f = open('X_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)

#X = normalize(X)

# compute the inverse of l2 norm of non-zero elements
X.data **= 2
norm = X.sum(axis=1)
n_nzeros = np.where(norm > 0)
norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
norm = np.array(norm).T[0]
X.data = np.sqrt(X.data)  # undo the earlier squaring of the stored values
# modify X (a CSR matrix) in place
sparsetools.csr_scale_rows(X.shape[0], X.shape[1], X.indptr, X.indices, X.data,
                           norm)

print X.shape
print X[0, :].sum()

sparse_save(X, "../data/tfidf_norm.h5")

# f = open('X_norm_data.p', 'wb')
# cPickle.dump(X.data, f, protocol=-1)
# f = open('X__norm_indices.p', 'wb')
# cPickle.dump(X.indices, f, protocol=-1)
# f = open('X_norm_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)

#matrix = X.dot(X.T)
Example No. 22
    def build_rank(self, objects, isBinaryWord=False):
        """Build tf-idf ranking score for terms in the corpus.

        Note:
          The code in this method could have been extracted to other smaller
          methods, improving legibility. This extraction has not been done so
          that its runtime complexity can be computed easily (the runtime
          complexity can be improved).

        Args:
          objects (list of Indexable): List of indexed objects that will be
            considered during tf-idf score computation.

        """
        self.__build_vocabulary(objects, isBinaryWord)

        n_terms = len(self.vocabulary)
        n_docs = len(objects)
        ft_matrix = sp.lil_matrix((n_docs, n_terms), dtype=np.dtype(float))

        logger.info('Vocabulary assembled with terms count %s', n_terms)

        # compute tf
        logger.info('Starting tf computation...')
        for index, indexable in enumerate(objects):
            for word in indexable.words_generator(self.stop_words):
                word_index_in_vocabulary = self.vocabulary[word]
                doc_word_count = indexable.count_for_word(word)
                ft_matrix[index, word_index_in_vocabulary] = doc_word_count

        # Add synword's idf

        # for word in self.vocabulary_withoutsynword.keys():
        #     for synword in sy.synonymwords(word)[0:4]:
        #         word_index_in_vocabulary = self.vocabulary[word]
        #         synword_index_in_vocabulary = self.vocabulary[synword]
        #         if synword not in self.vocabulary_withoutsynword.keys():
        #             #print "origin word: ", word," synword: ",synword
        #             ft_matrix[:,synword_index_in_vocabulary] = ft_matrix[:,word_index_in_vocabulary]
        #         elif synword != word:
        #             newmatrix1 = 0.6*ft_matrix[:,word_index_in_vocabulary]+0.4*ft_matrix[:,synword_index_in_vocabulary]
        #             newmatrix2 = 0.4*ft_matrix[:,word_index_in_vocabulary]+0.6*ft_matrix[:,synword_index_in_vocabulary]
        #             ft_matrix[:,word_index_in_vocabulary] = newmatrix1
        #             ft_matrix[:,synword_index_in_vocabulary] = newmatrix2

        self.ft_matrix = ft_matrix.tocsc()

        logger.info('Starting tf-idf computation...')
        # compute idf with smoothing
        df = np.diff(self.ft_matrix.indptr) + self.smoothing
        n_docs_smooth = n_docs + self.smoothing

        # create diagonal matrix to be multiplied with ft
        idf = np.log(float(n_docs_smooth) / df) + 1.0
        self.ifd_diag_matrix = sp.spdiags(idf, diags=0, m=n_terms, n=n_terms)

        # compute tf-idf
        self.tf_idf_matrix = self.ft_matrix * self.ifd_diag_matrix
        self.tf_idf_matrix = self.tf_idf_matrix.tocsr()

        # compute tf-idf normalization
        norm = self.tf_idf_matrix.tocsr(copy=True)
        norm.data **= 2
        norm = norm.sum(axis=1)
        n_nzeros = np.where(norm > 0)
        norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
        norm = np.array(norm).T[0]
        sptools.csr_scale_rows(self.tf_idf_matrix.shape[0],
                               self.tf_idf_matrix.shape[1],
                               self.tf_idf_matrix.indptr,
                               self.tf_idf_matrix.indices,
                               self.tf_idf_matrix.data, norm)