Example 1
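The examples below are excerpts from a larger code base and omit their imports. The external dependencies are numpy and scipy; the remaining names (data, get_model, OOVException, ProgressIterator, nbprint, NMFSklearn, make_term_doc_mat_tf_idf) are project-internal helpers whose modules are not shown here, so the lines below are only a rough sketch of the external imports that the snippets assume:

import numpy as np
from scipy import sparse
from scipy.sparse.linalg import LinearOperator, eigs, ArpackNoConvergence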
def fv_mean_var_vectors(info, runvars):
    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    vec_shape = model.vector_size
    
    # First pass: accumulate embeddings to compute the mean vector
    mean_vec = np.zeros(vec_shape)
    count = 0
    for token in vocab_list:
        try:
            mean_vec += embedding_function(token)
            count += 1
        except OOVException:
            continue
    if count > 0:
        mean_vec /= count
    runvars['mean_vec'] = mean_vec
    
    # Second pass: accumulate squared deviations to compute the variance vector
    var_vec = np.zeros(vec_shape)
    count = 0
    for token in vocab_list:
        try:
            var_vec += np.square(embedding_function(token) - mean_vec)
            count += 1
        except OOVException:
            continue
    if count > 0:
        var_vec /= count
    # Clip the variance away from zero to avoid division by (near) zero downstream
    runvars['var_vec'] = np.maximum(0.001, var_vec)
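In formulas, the two passes above compute, over the set T of in-vocabulary tokens with embedding vectors v_t (all operations element-wise per embedding dimension):

    \mu = \frac{1}{|T|} \sum_{t \in T} v_t ,
    \qquad
    \sigma^2 = \max\Bigl( 0.001 ,\; \frac{1}{|T|} \sum_{t \in T} (v_t - \mu)^2 \Bigr)

Both vectors are stored in runvars ('mean_vec', 'var_vec') and consumed by fv_build_mat in Example 5.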
Example 2
def make_cbow_mat_tf_idf(info, runvars):
    # Create tf-idf matrix
    make_term_doc_mat_tf_idf(info, runvars)
    tf_idf_mat = runvars['term_doc_mat_tf_idf']
    
    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    
    # Create a zero matrix
    cbow_tf_idf_shape = (model.vector_size, tf_idf_mat.shape[1])
    cbow_tf_idf = np.zeros(cbow_tf_idf_shape)
    
    # Iterate over all nonzero entries of the tf-idf matrix:
    nonzeros = zip(*sparse.find(tf_idf_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length = tf_idf_mat.nnz, print_every = 5000):
        # Add each entry times the corresponding vector to the matrix
        try:
            cbow_tf_idf[:,doc_idx] = cbow_tf_idf[:,doc_idx] + value * embedding_function(vocab_list[token_idx])
        except OOVException:
            pass
        
    # Store the matrix in runvars
    runvars['cbow_mat'] = cbow_tf_idf
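Written out, each document column d of cbow_tf_idf is the tf-idf-weighted sum of the embedding vectors of the document's in-vocabulary tokens (OOV tokens are skipped):

    \text{cbow\_tf\_idf}_{:,\,d} = \sum_{t \in d} \text{tfidf}(t, d)\, v_t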
Example 3
    def _pre_algorithm(self):
        # Load the embeddings
        embedding_model = get_model(self.info)
        embeddings = embedding_model.get_embeddings()
        vector_size = embedding_model.vector_size

        # Load the vocab
        vocab = data.load_vocab_list(self.info)

        # Construct the embedding matrix V (one column per vocabulary token)
        v_shape = (vector_size, len(vocab))
        self.v_mat = np.zeros(v_shape)
        for idx, token in enumerate(vocab):
            try:
                self.v_mat[:, idx] = embeddings[token]
            except KeyError:
                # Skip out-of-vocabulary tokens; their columns stay zero
                pass

        # Find elements in the null space of V^T V
        if self.null is not None:
            nbprint('Finding {} elements in ker(VTV)'.format(self.num_kernel))
            self.kernelvectors = []
            for i in ProgressIterator(range(2 * self.num_kernel),
                                      print_every=1):
                op = LinearOperator(
                    (len(vocab), len(vocab)),
                    matvec=lambda x: self.v_mat.transpose() @ (self.v_mat @ x))
                try:
                    w, v = eigs(op, k=1, which='SM', maxiter=100)
                    w = np.real(w[0])
                    v = np.real(v[:, 0])
                    if w < 1e-10:
                        v = v / np.sqrt(np.sum(np.square(v)))
                        self.kernelvectors.append(v)
                        if len(self.kernelvectors) >= self.num_kernel:
                            break
                except ArpackNoConvergence:
                    nbprint('eigs did not converge')
            self.v_sums = [np.sum(v) for v in self.kernelvectors]

        # Initialize W and H from NMF
        nbprint('Initial NMF')
        nmf_model = NMFSklearn(self.num_topics, init='nndsvd')
        self.W = np.maximum(nmf_model.fit_transform(self.input_mat), self.eps)
        self.H = np.maximum(nmf_model.components_, self.eps)
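For context, the kernel search above looks for unit vectors x whose eigenvalue with respect to V^T V is below 1e-10, i.e. approximate elements of the kernel of V; the LinearOperator evaluates V^T (V x) so that the |vocab| x |vocab| matrix V^T V never has to be formed explicitly. Such vectors exist whenever the vocabulary is larger than the embedding dimension, because

    V^\top V x = 0 \iff \lVert V x \rVert_2 = 0 \iff x \in \ker(V),
    \qquad
    \dim \ker(V^\top V) \;\ge\; |\mathrm{vocab}| - \text{vector\_size}.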
Example 4
def make_cbow_mat_minmaxmean(info, runvars):
    # Get count matrix
    count_mat = runvars['term_doc_mat_count']
    
    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    
    # Create the min, max and mean matrices and the per-document token counts
    cbow_m_shape = (model.vector_size, count_mat.shape[1])
    cbow_min = np.full(cbow_m_shape, np.inf)
    cbow_max = np.full(cbow_m_shape, -np.inf)
    cbow_mean = np.zeros(cbow_m_shape)
    column_sum = np.zeros(count_mat.shape[1])
    
    # Iterate over all nonzero entries of the count matrix:
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length = count_mat.nnz, print_every = 5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue
        # Entry wise minimum with the embedding vector
        cbow_min[:,doc_idx] = np.minimum(cbow_min[:,doc_idx], embedding_vector)
        # Entry wise maximum with the embedding vector
        cbow_max[:,doc_idx] = np.maximum(cbow_max[:,doc_idx], embedding_vector)
        # Sum up all embedding vectors and the total number of tokens in the document
        cbow_mean[:,doc_idx] = cbow_mean[:,doc_idx] + value * embedding_vector
        column_sum[doc_idx] = column_sum[doc_idx] + value
        
    # Divide the accumulated sums by the number of tokens in each document (guarding against empty documents)
    cbow_mean = cbow_mean * sparse.diags(1/np.maximum(1, column_sum))
    
    # Stack all matrices, zero out non-finite entries and store
    cbow_mat = np.vstack((cbow_min,cbow_max,cbow_mean))
    cbow_mat[np.invert(np.isfinite(cbow_mat))] = 0
    runvars['cbow_mat'] = cbow_mat
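Concretely, for a document d with token counts c_{t,d}, the three stacked blocks of its column are (min and max taken element-wise over the in-vocabulary tokens occurring in d):

    \Bigl[\ \min_{t \in d} v_t \ ;\ \max_{t \in d} v_t \ ;\ \frac{\sum_{t \in d} c_{t,d}\, v_t}{\max\bigl(1, \sum_{t \in d} c_{t,d}\bigr)} \ \Bigr]

Columns of documents without any in-vocabulary token keep their +/-inf initialization, which the final np.isfinite mask resets to 0.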
Example 5
def fv_build_mat(info, runvars):    
    # Get matrices
    count_mat = runvars['term_doc_mat_count']
    mean_vec = runvars['mean_vec']
    var_vec = runvars['var_vec']
    
    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    
    # Create zero matrices for the Fisher vectors and the per-document token counts
    dimension = model.vector_size
    fv_m_shape = (dimension*2, count_mat.shape[1])
    fv_mat = np.zeros(fv_m_shape)
    fv_num_tokens_shape = (1, count_mat.shape[1])
    fv_num_tokens = np.zeros(fv_num_tokens_shape)
    
    # Iterate over all nonzero entries of the count matrix
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length = count_mat.nnz, print_every = 5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue
        fv_mat[:dimension, doc_idx] += value * (embedding_vector - mean_vec) / var_vec
        fv_mat[dimension:, doc_idx] += value * (np.square(embedding_vector - mean_vec) / (var_vec * np.sqrt(var_vec)) - (1 / np.sqrt(var_vec)))
        fv_num_tokens[0,doc_idx] += value
        
    # Normalize: guard empty documents against division by zero, scale each column
    # by the inverse square root of its token count and rescale the two blocks
    fv_num_tokens[fv_num_tokens == 0] = 1
    fv_mat *= np.power(fv_num_tokens, -0.5)
    fv_mat[:dimension,:] = (fv_mat[:dimension,:].transpose() * np.nan_to_num(np.power(1 / var_vec, -0.5))).transpose()
    fv_mat[dimension:,:] = (fv_mat[dimension:,:].transpose() * np.nan_to_num(np.power(2 / var_vec, -0.5))).transpose()
        
    runvars['cbow_mat'] = fv_mat
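Writing mu for mean_vec, sigma^2 for var_vec, c_{t,d} for the count of token t in document d, N_d for the document's total in-vocabulary token count and D = model.vector_size, the accumulation and the subsequent rescaling combine to the following two blocks per column (element-wise operations), which match the usual Fisher vector statistics of a single diagonal Gaussian (log-likelihood gradients with respect to mu and sigma, normalized by the approximate inverse square-root Fisher information):

    \text{fv\_mat}_{1:D,\,d} = \frac{1}{\sqrt{N_d}} \sum_{t \in d} c_{t,d}\, \frac{v_t - \mu}{\sigma},
    \qquad
    \text{fv\_mat}_{D+1:2D,\,d} = \frac{1}{\sqrt{2 N_d}} \sum_{t \in d} c_{t,d} \Bigl( \frac{(v_t - \mu)^2}{\sigma^2} - 1 \Bigr)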
Example 6
def make_phrase_mat(info, runvars):
    model = get_model(info)
    embedding_function = model.embedding_function
    batch = []
    batchsize = 0
    min_batchsize = 4096
    current_idx = 0

    # Count documents
    num_documents = 0
    with data.document_reader(info) as documents:
        for document in ProgressIterator(documents, 'Counting Documents'):
            num_documents += 1

    # Create a zero matrix
    phrase_mat_shape = (model.vector_size, num_documents)
    phrase_mat = np.zeros(phrase_mat_shape)

    with data.document_reader(info) as documents:
        progress_iterator = ProgressIterator(documents,
                                             'Vectorizing Documents')
        for document in progress_iterator:
            batch.append(document['text'])
            batchsize += 1

            if batchsize >= min_batchsize:
                phrase_mat[:, current_idx:current_idx +
                           batchsize] = embedding_function(batch)
                current_idx += batchsize
                batchsize = 0
                batch = []

        if batchsize > 0:
            phrase_mat[:, current_idx:current_idx +
                       batchsize] = embedding_function(batch)
    runvars['phrase_mat'] = phrase_mat
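The slice assignments above rely on embedding_function returning, for a list of texts, an array with one column per document, i.e. shape (model.vector_size, len(batch)); that layout is inferred from the assignment target rather than stated anywhere. A self-contained toy sketch of the same batching pattern under that assumption (dummy_embed and the example documents are hypothetical stand-ins):

import numpy as np

def dummy_embed(texts, vector_size=4):
    # Hypothetical stand-in for model.embedding_function: one column per input text
    return np.random.rand(vector_size, len(texts))

docs = ['first document', 'second document', 'third document']
phrase_mat = np.zeros((4, len(docs)))
batch, current_idx, min_batchsize = [], 0, 2
for text in docs:
    batch.append(text)
    if len(batch) >= min_batchsize:
        # Flush a full batch into the preallocated matrix
        phrase_mat[:, current_idx:current_idx + len(batch)] = dummy_embed(batch)
        current_idx += len(batch)
        batch = []
if batch:
    # Flush the final partial batch
    phrase_mat[:, current_idx:current_idx + len(batch)] = dummy_embed(batch)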
Example 7
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Load the embedding model and reuse its token filter
        self.embedding_model = get_model(self.info)
        self.filter = self.embedding_model.filter.filter