class LinearModel(SHALOModelVectorMean, SHALOModelFixed):
    """Linear model over pretrained embeddings"""

    name = 'LinearModel'

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize word table and populate with embedding words
        if init:
            self.word_dict = SymbolTable()
            for word in self.embedding_words:
                self.word_dict.get(word)
        # Process data
        return [
            map_words_to_symbols(s, self.word_dict.lookup, self.ngrams)
            for s in sentence_data
        ]
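

# SymbolTable and map_words_to_symbols come from the repo's utilities and are
# not shown in this file. As a rough illustration only -- the real SymbolTable
# may differ in detail -- the behavior _preprocess_data relies on looks like
# this: get() assigns a fresh index the first time a word is seen, while
# lookup() is read-only and sends unseen words to a reserved UNKNOWN index.

class _SketchSymbolTable(object):
    """Illustrative stand-in for SymbolTable (an assumption, not the repo's)."""

    def __init__(self, starting_symbol=2, unknown_symbol=1):
        self.s = starting_symbol   # next free index; 0 is reserved for padding
        self.unknown = unknown_symbol
        self.d = {}

    def get(self, w):
        # Assign the next free index on first encounter
        if w not in self.d:
            self.d[w] = self.s
            self.s += 1
        return self.d[w]

    def lookup(self, w):
        # Read-only: unseen words map to UNKNOWN
        return self.d.get(w, self.unknown)


def _demo_symbol_mapping():
    """Toy example: an out-of-vocabulary word maps to the UNKNOWN index."""
    table = _SketchSymbolTable()
    for w in ['the', 'cat', 'sat']:
        table.get(w)
    return [table.lookup(w) for w in ['the', 'sat', 'dog']]  # -> [2, 4, 1]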


class SHALOModelPreTrain(SHALOModel):
    """SHALO model initialized with pretrained word embeddings"""

    name = 'SHALOModelPreTrain'

    def __init__(self, embedding_file, save_file=None, n_threads=None):
        SHALOModel.__init__(self, save_file, n_threads)
        # Load pretrained embedding vocabulary and matrix
        with open(embedding_file, 'rb') as f:
            self.embedding_words, self.embeddings = cPickle.load(f)

    def _word_table_init(self, training_sentences):
        """Get training words and init word table with pre-embedded words"""
        self._get_training_words(training_sentences)
        self.word_dict = SymbolTable()
        for word in self.embedding_words_train:
            self.word_dict.get(word)

    def _get_training_words(self, training_sentences):
        """Get training words and subset of pre-embedded words in train set"""
        unique_words = set(w for s in training_sentences for w in s)
        embedding_idxs_train, self.embedding_words_train = [], []
        for i, word in enumerate(self.embedding_words):
            if word in unique_words:
                self.embedding_words_train.append(word)
                embedding_idxs_train.append(i)
        idxs = np.ravel(embedding_idxs_train)
        self.embeddings_train = self.embeddings[idxs, :]

    def _get_embedding(self):
        """
        Return embedding tensor (either constant or variable)
        Row 0 is 0 vector for no token
        Row 1 is random initialization for UNKNOWN
        Rows 2 : 2 + len(self.embedding_words_train) are pretrained initialization
        Remaining rows are random initialization
        """
        zero = tf.constant(0.0, dtype=tf.float32, shape=(1, self.d))
        s = self.seed - 1
        unk = tf.Variable(tf.random_normal((1, self.d), stddev=SD, seed=s))
        pretrain = tf.Variable(self.embeddings_train, dtype=tf.float32)
        vecs = [zero, unk, pretrain]
        # Add randomly initialized rows for vocab words with no pretrained vector
        n_r = self.word_dict.num_words() - len(self.embedding_words_train)
        if n_r > 0:
            r = tf.Variable(tf.random_normal((n_r, self.d), stddev=SD, seed=s))
            vecs.append(r)
        self.U = tf.concat(vecs, axis=0, name='embedding_matrix')
        return self.U
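

# A small NumPy sketch of the row layout _get_embedding produces. The sizes
# below are made up for illustration; only the ordering of the blocks matters.

def _sketch_embedding_layout():
    import numpy as np
    d = 4                                    # embedding dimension (hypothetical)
    rng = np.random.RandomState(0)
    pretrained = rng.randn(3, d)             # stands in for self.embeddings_train
    n_random = 2                             # vocab words with no pretrained vector
    U = np.vstack([
        np.zeros((1, d)),                            # row 0: padding / no token
        rng.normal(scale=0.1, size=(1, d)),          # row 1: UNKNOWN
        pretrained,                                  # rows 2 .. 2 + 3: pretrained init
        rng.normal(scale=0.1, size=(n_random, d)),   # remaining rows: random init
    ])
    assert U.shape == (1 + 1 + 3 + n_random, d)
    return U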


class TTBB(SHALOModelFixed):
    """Implementation of 'A Simple but Tough-to-Beat Baseline for Sentence
    Embeddings'

    In the basic model, the common component vector is computed once from the
    training data before any other computation. The embeddings are fixed, so
    no updates are made.
    """

    name = 'TTBB'

    def __init__(self, embedding_file, word_freq_file, save_file=None,
                 n_threads=None):
        SHALOModelFixed.__init__(self, embedding_file, save_file, n_threads)
        # Get word frequency (marginals) file
        with open(word_freq_file, 'rb') as f:
            self.word_freq = cPickle.load(f)

    def _word_table_init(self, training_sentences):
        self.word_dict = SymbolTable()
        for word in self.embedding_words:
            self.word_dict.get(word)

    def _get_mapper(self, init):
        return self.word_dict.lookup

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize word table and populate with embedding words
        if init:
            self._word_table_init(sentence_data)
        # Map tokens and return if not initializing
        mapper = self._get_mapper(init)
        tokens = [
            np.ravel(map_words_to_symbols(s, mapper, self.ngrams))
            for s in sentence_data
        ]
        self.train_tokens = tokens
        if not init:
            return tokens
        # If initializing, get marginal estimates for each symbol
        self.marginals = np.zeros(self.word_dict.num_symbols())
        for word, idx in self.word_dict.d.iteritems():
            # Try getting word frequency directly
            if word in self.word_freq:
                self.marginals[idx] = self.word_freq[word]
            # Otherwise, try getting minimum frequency among sub-grams
            split_grams = word.split(GRAMSEP)
            if len(split_grams) > 1:
                min_freq = min(self.word_freq.get(w, 0.0) for w in split_grams)
                self.marginals[idx] = min_freq
        # Get initial smoother value (an exponent; a = 10 ** self.a)
        self.a = self.train_kwargs.get('a', -3.0)
        return tokens

    def _compute_train_common_component(self, init=False):
        if init:
            self.session.run(tf.global_variables_initializer())
        x_array, x_len = self._get_data_batch(self.train_tokens)
        self.ccx = self.session.run(self.tf_ccx, {
            self.input:         x_array,
            self.input_lengths: x_len
        })
        return self.ccx

    def _get_a_exp(self):
        return tf.constant(self.a, dtype=tf.float32)

    def _get_common_component(self):
        self.ccx = self._compute_train_common_component(init=True)
        return tf.constant(self.ccx, dtype=tf.float32)

    def _embed_sentences(self):
        """Tensorflow implementation of Simple but Tough-to-Beat Baseline"""
        # Get word features
        word_embeddings = self._get_embedding()
        word_feats = tf.nn.embedding_lookup(word_embeddings, self.input)
        # Get marginal estimates and scaling term
        batch_size = tf.shape(word_feats)[0]
        a = tf.pow(10.0, self._get_a_exp())
        p = tf.constant(self.marginals, dtype=tf.float32, name='marginals')
        q = tf.reshape(
            a / (a + tf.nn.embedding_lookup(p, self.input)),
            (batch_size, self.mx_len, 1)
        )
        # Compute initial sentence embedding: length-normalized weighted sum
        z = tf.reshape(1.0 / tf.to_float(self.input_lengths), (batch_size, 1))
        S = z * tf.reduce_sum(q * word_feats, axis=1)
        # Compute common component (first right-singular vector of centered S)
        S_centered = S - tf.reduce_mean(S, axis=0)
        _, _, V = tf.svd(S_centered, full_matrices=False, compute_uv=True)
        self.tf_ccx = tf.stop_gradient(tf.gather(tf.transpose(V), 0))
        # Common component removal: S - S c c^T
        ccx = tf.reshape(self._get_common_component(), (1, self.d))
        sv = {'embeddings': word_embeddings, 'a': a, 'p': p, 'ccx': ccx}
        return S - tf.matmul(S, ccx * tf.transpose(ccx)), sv
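

# End-to-end NumPy sketch of the TTBB computation in _embed_sentences, for
# reference only. It assumes every word has a vector in `vecs` and a marginal
# in `freqs`; the default exponent of -3.0 above corresponds to a = 10 ** -3.
# Like the TF code above, the sentence matrix is mean-centered before taking
# the first right-singular vector.

def _sif_embed_sketch(sentences, vecs, freqs, a=1e-3):
    """sentences: list of token lists; vecs: word -> (d,) array;
    freqs: word -> unigram marginal. Returns an (n_sentences, d) array."""
    import numpy as np
    # Weight each word vector by a / (a + p(w)) and average per sentence
    S = np.vstack([
        np.mean([(a / (a + freqs.get(w, 0.0))) * vecs[w] for w in s], axis=0)
        for s in sentences
    ])
    # First principal direction of the centered sentence embeddings
    _, _, Vt = np.linalg.svd(S - S.mean(axis=0), full_matrices=False)
    c = Vt[0]
    # Remove the projection onto the common component: S - S c c^T
    return S - np.outer(S.dot(c), c)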