Example #1
def make_cbow_mat_tf_idf(info, runvars):
    # Create tf-idf matrix
    make_term_doc_mat_tf_idf(info, runvars)
    tf_idf_mat = runvars['term_doc_mat_tf_idf']
    
    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    
    # Create a zero matrix
    cbow_tf_idf_shape = (model.vector_size, tf_idf_mat.shape[1])
    cbow_tf_idf = np.zeros(cbow_tf_idf_shape)
    
    # Iterate over all nonzero entries of the tf-idf matrix:
    nonzeros = zip(*sparse.find(tf_idf_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length=tf_idf_mat.nnz, print_every=5000):
        # Add value times the corresponding embedding vector to the document column
        try:
            cbow_tf_idf[:, doc_idx] += value * embedding_function(vocab_list[token_idx])
        except OOVException:
            # Token is out of vocabulary for the embedding model; skip it
            pass
        
    # Return the matrix
    runvars['cbow_mat'] = cbow_tf_idf
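
The same construction works outside the framework. Below is a minimal self-contained sketch of the tf-idf-weighted CBOW matrix, using a toy vocabulary and a random stand-in embedding table (all names and data here are illustrative, not part of the codebase):

import numpy as np
from scipy import sparse

# Toy stand-ins: a 3-token vocabulary with random 4-dimensional embeddings
vocab_list = ["cat", "dog", "tree"]
rng = np.random.default_rng(0)
embeddings = {tok: rng.normal(size=4) for tok in vocab_list}

# A 3x2 tf-idf matrix (tokens x documents)
tf_idf_mat = sparse.csc_matrix(np.array([[0.5, 0.0],
                                         [0.2, 0.7],
                                         [0.0, 0.3]]))

# Accumulate value * embedding for every nonzero (token, document) entry
cbow = np.zeros((4, tf_idf_mat.shape[1]))
for token_idx, doc_idx, value in zip(*sparse.find(tf_idf_mat)):
    cbow[:, doc_idx] += value * embeddings[vocab_list[token_idx]]

print(cbow.shape)  # (4, 2): one weighted embedding sum per document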
Example #2
def fv_mean_var_vectors(info, runvars):
    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    vec_shape = model.vector_size
    
    # Sum to compute mean
    mean_vec = np.zeros(vec_shape)
    count = 0
    for token in vocab_list:
        try:
            mean_vec += embedding_function(token)
            count += 1
        except OOVException:
            continue
    if count > 0:
        mean_vec /= count
    runvars['mean_vec'] = mean_vec
    
    # Sum to compute variance
    var_vec = np.zeros(vec_shape)
    count = 0
    for token in vocab_list:
        try:
            var_vec += np.square(embedding_function(token) - mean_vec)
            count += 1
        except OOVException:
            continue
    if count > 0:
        var_vec /= count
    # Floor the variance to avoid division by (near) zero downstream
    runvars['var_vec'] = np.maximum(0.001, var_vec)
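
In formulas, with E the set of in-vocabulary tokens and e(t) the embedding of token t, the two passes compute (component-wise)

\mu = \frac{1}{|E|} \sum_{t \in E} e(t), \qquad \sigma^2 = \max\Bigl(0.001,\; \frac{1}{|E|} \sum_{t \in E} \bigl(e(t) - \mu\bigr)^2\Bigr),

where the 0.001 floor keeps later divisions by \sigma^2 (e.g. in fv_build_mat) numerically safe.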
Example #3
    def run(self, info):
        num_tokens = config.distiller['num_tokens']
        w_mat = data.load_w_mat(info)
        vocab = data.load_vocab_list(info)
        sorted_idcs = np.argsort(w_mat, axis=0)
        topiclist = []
        for col in range(w_mat.shape[1]):
            topic = []
            for idx in sorted_idcs[-num_tokens:, col][::-1]:
                topic.append(
                    TopicEntry(idx=int(idx),
                               weight=float(w_mat[idx, col]),
                               token=vocab[idx]))
            topiclist.append(topic)
        self.topic_token_version = info['token_version']
        self.topiclist = topiclist
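
The selection works because np.argsort sorts ascending, so the last num_tokens rows of sorted_idcs hold the indices of the largest weights, and [::-1] flips them into descending order. A toy check (data illustrative):

import numpy as np

w_mat = np.array([[0.1, 0.9],
                  [0.8, 0.2],
                  [0.4, 0.5]])
sorted_idcs = np.argsort(w_mat, axis=0)
# The two strongest tokens of column 0, strongest first:
print(sorted_idcs[-2:, 0][::-1])  # [1 2], i.e. weights 0.8 and 0.4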
Example #4
    def _pre_algorithm(self):
        # Load the embeddings
        embedding_model = get_model(self.info)
        embeddings = embedding_model.get_embeddings()
        vector_size = embedding_model.vector_size

        # Load the vocab
        vocab = data.load_vocab_list(self.info)

        # construct V
        v_shape = (vector_size, len(vocab))
        self.v_mat = np.zeros(v_shape)
        for idx, token in enumerate(vocab):
            try:
                self.v_mat[:, idx] = embeddings[token]
            except KeyError:
                # Token has no embedding vector; leave its column at zero
                pass

        # find elements in the nullspace of VTV
        if self.null is not None:
            nbprint('Finding {} elements in ker(VTV)'.format(self.num_kernel))
            self.kernelvectors = []
            for i in ProgressIterator(range(2 * self.num_kernel),
                                      print_every=1):
                op = LinearOperator(
                    (len(vocab), len(vocab)),
                    matvec=lambda x: self.v_mat.transpose() @ (self.v_mat @ x))
                try:
                    w, v = eigs(op, k=1, which='SM', maxiter=100)
                    w = np.real(w[0])
                    v = np.real(v[:, 0])
                    if w < 1e-10:
                        v = v / np.sqrt(np.sum(np.square(v)))
                        self.kernelvectors.append(v)
                        if len(self.kernelvectors) >= self.num_kernel:
                            break
                except ArpackNoConvergence:
                    nbprint('eigs did not converge')
            self.v_sums = [np.sum(v) for v in self.kernelvectors]

        # Initialize W and H from NMF
        nbprint('Initial NMF')
        nmf_model = NMFSklearn(self.num_topics, init='nndsvd')
        self.W = np.maximum(nmf_model.fit_transform(self.input_mat), self.eps)
        self.H = np.maximum(nmf_model.components_, self.eps)
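
The nullspace search relies on the fact that V^T V and V have the same kernel:

V^{\mathsf T} V x = 0 \;\Rightarrow\; x^{\mathsf T} V^{\mathsf T} V x = \lVert V x \rVert^2 = 0 \;\Rightarrow\; V x = 0.

Working with the map x \mapsto V^{\mathsf T}(V x) keeps the eigenproblem at size len(vocab) x len(vocab) without ever forming V^T V explicitly, which is what the LinearOperator accomplishes.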
Example #5
def make_cbow_mat_minmaxmean(info, runvars):
    # Get count matrix
    count_mat = runvars['term_doc_mat_count']
    
    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    
    # Create accumulator matrices for per-document min, max, and mean
    cbow_m_shape = (model.vector_size, count_mat.shape[1])
    cbow_min = np.full(cbow_m_shape, np.inf)
    cbow_max = np.full(cbow_m_shape, -np.inf)
    cbow_mean = np.zeros(cbow_m_shape)
    column_sum = np.zeros(count_mat.shape[1])
    
    # Iterate over all nonzero entries of the count matrix:
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length=count_mat.nnz, print_every=5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue
        # Entry wise minimum with the embedding vector
        cbow_min[:,doc_idx] = np.minimum(cbow_min[:,doc_idx], embedding_vector)
        # Entry wise maximum with the embedding vector
        cbow_max[:,doc_idx] = np.maximum(cbow_max[:,doc_idx], embedding_vector)
        # Sum up all embedding vectors and the total number of tokens in the document
        cbow_mean[:,doc_idx] = cbow_mean[:,doc_idx] + value * embedding_vector
        column_sum[doc_idx] = column_sum[doc_idx] + value
        
    # Divide sum by number of tokens
    cbow_mean = cbow_mean * sparse.diags(1/np.maximum(1, column_sum))
    
    # Stack all matrices and return
    cbow_mat = np.vstack((cbow_min,cbow_max,cbow_mean))
    cbow_mat[~np.isfinite(cbow_mat)] = 0  # zero out the ±inf entries left by empty documents
    runvars['cbow_mat'] = cbow_mat
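
The diagonal-scaling trick on the mean part deserves a note: right-multiplying by sparse.diags(1 / np.maximum(1, column_sum)) rescales each column by the inverse of its token count, turning per-document sums into means while leaving empty documents at zero. A two-column toy (values illustrative; the second document is empty):

import numpy as np
from scipy import sparse

sums = np.array([[2.0, 0.0],
                 [4.0, 0.0]])
column_sum = np.array([2.0, 0.0])
means = sums * sparse.diags(1 / np.maximum(1, column_sum))
print(means)  # [[1. 0.], [2. 0.]]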
Example #6
def fv_build_mat(info, runvars):    
    # Get matrices
    count_mat = runvars['term_doc_mat_count']
    mean_vec = runvars['mean_vec']
    var_vec = runvars['var_vec']
    
    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    
    # Create zero matrices for the Fisher vector and the per-document token counts
    dimension = model.vector_size
    fv_m_shape = (dimension*2, count_mat.shape[1])
    fv_mat = np.zeros(fv_m_shape)
    fv_num_tokens_shape = (1, count_mat.shape[1])
    fv_num_tokens = np.zeros(fv_num_tokens_shape)
    
    # Iterate over all nonzero entries of the count matrix
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length=count_mat.nnz, print_every=5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue
        # First-order term: gradient with respect to the Gaussian mean
        fv_mat[:dimension, doc_idx] += value * (embedding_vector - mean_vec) / var_vec
        # Second-order term: gradient with respect to the standard deviation
        fv_mat[dimension:, doc_idx] += value * (np.square(embedding_vector - mean_vec) / (var_vec * np.sqrt(var_vec)) - (1 / np.sqrt(var_vec)))
        fv_num_tokens[0, doc_idx] += value
        
    # Normalize: scale by 1/sqrt(num_tokens), then by the (diagonal) Fisher information
    fv_num_tokens[fv_num_tokens == 0] = 1
    fv_mat *= np.power(fv_num_tokens, -0.5)
    fv_mat[:dimension, :] = (fv_mat[:dimension, :].transpose() * np.nan_to_num(np.power(1 / var_vec, -0.5))).transpose()
    fv_mat[dimension:, :] = (fv_mat[dimension:, :].transpose() * np.nan_to_num(np.power(2 / var_vec, -0.5))).transpose()
        
    runvars['cbow_mat'] = fv_mat
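
After normalization the two halves of fv_mat match the Fisher vector of a single Gaussian with diagonal covariance: for a document whose T = fv_num_tokens token occurrences have embeddings x_1, ..., x_T,

\mathcal{G}_{\mu} = \frac{1}{\sqrt{T}} \sum_{t=1}^{T} \frac{x_t - \mu}{\sigma}, \qquad \mathcal{G}_{\sigma} = \frac{1}{\sqrt{2T}} \sum_{t=1}^{T} \left( \frac{(x_t - \mu)^2}{\sigma^2} - 1 \right),

with all operations component-wise and \mu, \sigma^2 the mean and variance vectors from fv_mean_var_vectors.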
Example #7
    def run(self, info):
        c_vec = load_c_vec(info)
        if c_vec is None:
            return
        second_info = info['second_info']
        num_tokens = config.distiller['num_tokens']
        num_topics = info['num_topics']
        vocab = data.load_vocab_list(second_info)
        input_mat = data.load_input_mat(second_info)
        c_vec_ids = load_mat_ids(info)
        input_mat_ids = data.load_mat_ids(second_info)

        # Keep only documents whose ids appear in both matrices
        common_ids = set(c_vec_ids) & set(input_mat_ids)
        filter_c_vec = [
            idx for idx, docid in enumerate(c_vec_ids) if docid in common_ids
        ]
        c_vec = c_vec[filter_c_vec]
        filter_input_mat = [
            idx for idx, docid in enumerate(input_mat_ids)
            if docid in common_ids
        ]
        input_mat = input_mat[:, filter_input_mat]

        topiclist = []
        for topic_idx in range(num_topics):
            topic = []
            target_vector = (c_vec == topic_idx).astype(int)
            mi = mutual_info_classif(input_mat.transpose(), target_vector)
            sorted_idcs = np.argsort(mi)
            for idx in sorted_idcs[-num_tokens:][::-1]:
                topic.append(
                    TopicEntry(idx=int(idx), weight=float(mi[idx]), token=vocab[idx]))
            topiclist.append(topic)

        self.topic_token_version = second_info['token_version']
        self.topiclist = topiclist
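
mutual_info_classif scores each feature, here each token after transposing the term-document matrix so documents become rows, by its estimated mutual information with the cluster labels. A minimal call on toy data (everything here is illustrative):

import numpy as np
from sklearn.feature_selection import mutual_info_classif

# 6 documents x 3 tokens; y assigns each document to a cluster
X = np.array([[3, 0, 1],
              [2, 0, 0],
              [4, 1, 1],
              [0, 3, 0],
              [0, 2, 1],
              [1, 4, 0]])
y = np.array([0, 0, 0, 1, 1, 1])

mi = mutual_info_classif(X, y, random_state=0)
print(np.argsort(mi)[::-1])  # token indices ranked by mutual information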
Example #8
    def run(self, info):
        h_mat = load_h_mat(info)
        if h_mat is None:
            return
        second_info = info['second_info']
        num_tokens = config.distiller['num_tokens']
        num_topics = info['num_topics']
        vocab = data.load_vocab_list(second_info)
        input_mat = data.load_input_mat(second_info)
        h_mat_ids = load_mat_ids(info)
        input_mat_ids = data.load_mat_ids(second_info)

        # Merge-style intersection of the two id lists (both assumed sorted ascending)
        common_ids = set()
        input_mat_ids2 = input_mat_ids.copy()
        for i in h_mat_ids:
            try:
                while input_mat_ids2[0] < i:
                    input_mat_ids2 = input_mat_ids2[1:]
                if input_mat_ids2[0] == i:
                    input_mat_ids2 = input_mat_ids2[1:]
                    common_ids.add(i)
            except IndexError:
                break
        filter_h_mat = [
            idx for idx, docid in enumerate(h_mat_ids) if docid in common_ids
        ]
        h_mat = h_mat[:, filter_h_mat]
        filter_input_mat = [
            idx for idx, docid in enumerate(input_mat_ids)
            if docid in common_ids
        ]
        input_mat = input_mat[:, filter_input_mat]

        eps = 1e-16
        threshold = 1e-16
        # Column-normalize H, then fit W to input_mat with HALS-style updates
        Ht = (h_mat / np.maximum(np.sum(h_mat, 0), eps)).T
        W = input_mat @ Ht
        W = W / np.maximum(np.sum(W, 0), eps)
        for iteration in range(100):
            HHT = np.dot(Ht.T, Ht)
            W_old = np.copy(W)
            for r in range(num_topics):
                hr = Ht[:, r]
                idx = [i for i in range(num_topics) if i != r]
                wr = 1 / HHT[r, r] * (input_mat @ hr - W[:, idx] @ HHT[idx, r])
                W[:, r] = np.maximum(wr, eps).T
            mean_w_change = np.mean(np.abs((W - W_old) / W_old))
            if mean_w_change < threshold:
                nbprint(
                    'Converged after {} iterations. (threshold = {})'.format(
                        iteration + 1, threshold))
                break
        # Normalize each topic to unit length, then remove the component
        # shared with the mean topic to emphasize distinctive tokens
        for r in range(num_topics):
            W[:, r] /= np.sqrt(np.sum(np.square(W[:, r])))
        mean_topic = np.mean(W, axis=1)
        mean_topic /= np.sqrt(np.sum(np.square(mean_topic)))
        for r in range(num_topics):
            W[:, r] = W[:, r] - np.sum(W[:, r] * mean_topic) * mean_topic

        sorted_idcs = np.argsort(W, axis=0)
        topiclist = []
        for col in range(W.shape[1]):
            topic = []
            for idx in sorted_idcs[-num_tokens:, col][::-1]:
                topic.append(
                    TopicEntry(idx=int(idx),
                               weight=float(W[idx, col]),
                               token=vocab[idx]))
            topiclist.append(topic)
        self.topic_token_version = second_info['token_version']
        self.topiclist = topiclist
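
The inner loop is a HALS-style block-coordinate update of W for the objective \lVert X - W H \rVert_F^2 with H fixed: writing h_r for the r-th row of H (column r of Ht), each topic column is refreshed as

w_r \leftarrow \max\!\left(\varepsilon,\; \frac{X h_r - \sum_{i \neq r} w_i \,(H H^{\mathsf T})_{i r}}{(H H^{\mathsf T})_{r r}}\right),

which is exactly the wr computed inside the r-loop, clipped at eps to keep W positive.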