Example #1
    def _init(self, loader: ResourceLoader, voc: Iterable[str]):
        # TODO we should not be building variables here
        if voc is not None:
            word_to_vec = loader.load_word_vec(self.vec_name, voc)
        else:
            word_to_vec = loader.load_word_vec(self.vec_name)
            voc = set(word_to_vec.keys())

        self._word_to_ix = {}

        dim = next(iter(word_to_vec.values())).shape[0]

        null_embed = tf.zeros((1, dim), dtype=tf.float32)
        unk_embed = tf.get_variable(shape=(1, dim),
                                    name="unk_embed",
                                    dtype=np.float32,
                                    trainable=self.learn_unk,
                                    initializer=tf.random_uniform_initializer(
                                        -self.word_vec_init_scale,
                                        self.word_vec_init_scale))
        ix = 2  # rows 0 and 1 are reserved for the null and unk embeddings
        matrix_list = [null_embed, unk_embed]

        if self._special_tokens is not None and len(self._special_tokens) > 0:
            print("Building embeddings for %d special_tokens" %
                  (len(self._special_tokens)))
            tok_embed = tf.get_variable(
                shape=(len(self._special_tokens), dim),
                name="token_embed",
                dtype=np.float32,
                trainable=True,
                initializer=tf.random_uniform_initializer(
                    -self.word_vec_init_scale, self.word_vec_init_scale))
            matrix_list.append(tok_embed)
            for token in self._special_tokens:
                self._word_to_ix[token] = ix
                ix += 1

        mat = []
        for word in voc:
            if word in self._word_to_ix:
                continue  # may already have been added while falling back from a capitalized variant of `word`
            if word in word_to_vec:
                mat.append(word_to_vec[word])
                self._word_to_ix[word] = ix
                ix += 1
            else:
                lower = word.lower()  # Fall back to the lower-case version
                if lower in word_to_vec and lower not in self._word_to_ix:
                    mat.append(word_to_vec[lower])
                    self._word_to_ix[lower] = ix
                    ix += 1

        print("Had pre-trained word embeddings for %d of %d words" %
              (len(mat), len(voc)))

        matrix_list.append(tf.constant(value=np.vstack(mat)))

        self._word_emb_mat = tf.concat(matrix_list, axis=0)
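
A minimal usage sketch for the matrix built above (hedged: `embedder` stands for an instance whose `_init` has already run, and `words_to_ids` is a hypothetical helper, not part of the class). Row 0 of the concatenated matrix is the null/padding embedding and row 1 is the learned unk embedding, so out-of-vocabulary tokens fall back to index 1:

    import numpy as np
    import tensorflow as tf

    def words_to_ids(embedder, words):
        # Hypothetical helper: unknown words fall back to the unk row (index 1)
        return np.array([embedder._word_to_ix.get(w, 1) for w in words], dtype=np.int32)

    word_ids = tf.placeholder(tf.int32, shape=[None], name="word_ids")
    # Look up rows of the matrix assembled in _init
    word_vectors = tf.nn.embedding_lookup(embedder._word_emb_mat, word_ids)
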
Example #2
    def _init(self, loader: ResourceLoader, voc: Iterable[str]):
        # TODO we should not be building variables here
        if voc is not None:
            word_to_vec = loader.load_word_vec(self.vec_name, voc)
        else:
            word_to_vec = loader.load_word_vec(self.vec_name)
            voc = set(word_to_vec.keys())

        self._word_to_ix = {}

        dim = next(iter(word_to_vec.values())).shape[0]
        if self.placeholder_flag:
            dim += 1  # extra column used to flag the placeholder rows

        null_embed = tf.zeros((1, dim), dtype=tf.float32)
        ix = 1  # row 0 is reserved for the null embedding
        matrix_list = [null_embed]

        if self._special_tokens is not None and len(self._special_tokens) > 0:
            print("Building embeddings for %d special_tokens" %
                  (len(self._special_tokens)))
            tok_embed = tf.get_variable(
                shape=(len(self._special_tokens), dim),
                name="token_embed",
                dtype=np.float32,
                trainable=True,
                initializer=tf.random_uniform_initializer(
                    -self.word_vec_init_scale, self.word_vec_init_scale))
            matrix_list.append(tok_embed)
            for token in self._special_tokens:
                self._word_to_ix[token] = ix
                ix += 1

        mat = []
        for word in voc:
            if word in self._word_to_ix:
                continue  # may already have been added while falling back from a capitalized variant of `word`
            if word in word_to_vec:
                mat.append(word_to_vec[word])
                self._word_to_ix[word] = ix
                ix += 1
            else:
                lower = word.lower()  # Fall back to the lower-case version
                if lower in word_to_vec and lower not in self._word_to_ix:
                    mat.append(word_to_vec[lower])
                    self._word_to_ix[lower] = ix
                    ix += 1

        print("Had pre-trained word embeddings for %d of %d words" %
              (len(mat), len(voc)))

        mat = np.vstack(mat)
        if self.placeholder_flag:
            mat = np.concatenate(
                [mat, np.zeros((len(mat), 1), dtype=np.float32)], axis=1)
        matrix_list.append(tf.constant(value=mat))

        self._placeholder_start = ix

        if self.placeholder_flag:

            def init(shape, dtype=None, partition_info=None):
                # Random placeholder vectors with a final column of ones,
                # mirroring the zero column appended to the pre-trained rows
                out = tf.random_normal((self.n_placeholders, dim - 1),
                                       stddev=self.placeholder_stddev)
                return tf.concat([out, tf.ones((self.n_placeholders, 1))],
                                 axis=1)

            init_fn = init
        else:
            init_fn = tf.random_normal_initializer(
                stddev=self.placeholder_stddev)

        matrix_list.append(
            tf.get_variable("placeholders",
                            (self.n_placeholders, mat.shape[1]),
                            tf.float32,
                            trainable=False,
                            initializer=init_fn))

        self._word_emb_mat = tf.concat(matrix_list, axis=0)
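
This variant reserves `n_placeholders` frozen rows after the pre-trained block; when `placeholder_flag` is set, a final indicator column distinguishes them (zeros for pre-trained rows, ones for placeholders). A hedged sketch of how out-of-vocabulary words might be routed onto those rows (`embedder` is an assumed, already-initialized instance, and the round-robin assignment below is illustrative rather than taken from the source):

    import itertools

    _oov_ids = itertools.count()

    def word_to_id(embedder, word):
        ix = embedder._word_to_ix.get(word)
        if ix is not None:
            return ix
        # Unseen words cycle through the placeholder rows, which start at
        # _placeholder_start and span n_placeholders rows
        return embedder._placeholder_start + (next(_oov_ids) % embedder.n_placeholders)
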
Example #3
    def _init(self, loader: ResourceLoader, voc: Iterable[str],
              allow_update=False, do_update=False):
        # TODO we should not be building variables here
        if voc is not None:
            word_to_vec = loader.load_word_vec(self.vec_name, voc)
        else:
            word_to_vec = loader.load_word_vec(self.vec_name)
            voc = set(word_to_vec.keys())

        self._word_to_ix = {}

        dim = next(iter(word_to_vec.values())).shape[0]

        null_embed = tf.zeros((1, dim), dtype=tf.float32)
        if not do_update:
            self.unk_embed = tf.get_variable(
                shape=(1, dim), name="unk_embed",
                dtype=np.float32, trainable=self.learn_unk,
                initializer=tf.random_uniform_initializer(-self.word_vec_init_scale,
                                                          self.word_vec_init_scale))
        ix = 2  # rows 0 and 1 are reserved for the null and unk embeddings
        matrix_list = [null_embed, self.unk_embed]

        if self._special_tokens is not None and len(self._special_tokens) > 0:
            print("Building embeddings for %d special_tokens" % (len(self._special_tokens)))
            tok_embed = tf.get_variable(shape=(len(self._special_tokens), dim), name="token_embed",
                                        dtype=np.float32, trainable=True,
                                        initializer=tf.random_uniform_initializer(-self.word_vec_init_scale,
                                                                                  self.word_vec_init_scale))
            matrix_list.append(tok_embed)
            for token in self._special_tokens:
                self._word_to_ix[token] = ix
                ix += 1

        mat = []
        for word in voc:
            if word in self._word_to_ix:
                continue  # may already have been added while falling back from a capitalized variant of `word`
            if word in word_to_vec:
                mat.append(word_to_vec[word])
                self._word_to_ix[word] = ix
                ix += 1
            else:
                lower = word.lower()  # Fall back to the lower-case version
                if lower in word_to_vec and lower not in self._word_to_ix:
                    mat.append(word_to_vec[lower])
                    self._word_to_ix[lower] = ix
                    ix += 1

        print("Had pre-trained word embeddings for %d of %d words" % (len(mat), len(voc)))

        # Encoder will feed this as value of self.common_word_mat
        # Allows us to quickly change the vocabulary at test time
        self.common_word_mat_np = np.vstack(mat)

        if not do_update:
            # Set up the tf graph only once
            if allow_update:
                self.common_word_mat = tf.placeholder(tf.float32, shape=(None, dim),
                                                      name='common_word_mat')

                matrix_list.append(self.common_word_mat)
            else:
                self.common_word_mat = None
                matrix_list.append(tf.constant(value=self.common_word_mat_np))
            self._word_emb_mat = tf.concat(matrix_list, axis=0)
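
Here the common-word block becomes a `tf.placeholder` when `allow_update=True`, so the vocabulary can be swapped at test time without rebuilding the graph. A hedged sketch of the feed (assuming `embedder` is an initialized instance and `sess` is an open `tf.Session`):

    feed_dict = {}
    if embedder.common_word_mat is not None:
        # Feed the numpy matrix built in _init; calling _init again with
        # do_update=True recomputes common_word_mat_np for a new vocabulary
        feed_dict[embedder.common_word_mat] = embedder.common_word_mat_np
    word_emb = sess.run(embedder._word_emb_mat, feed_dict=feed_dict)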