Code Example #1
    def test_load_pretrained_embeddings(self):
        pretrained_emb = embeddings_utils.PretrainedEmbedding(
            RAW_EMBEDDING_PATH)

        # embed_vocab lists tokens in the order they appear in the file.
        self.assertEqual(len(pretrained_emb.embed_vocab), VOCAB_SIZE)
        self.assertEqual(pretrained_emb.embed_vocab[0], "</s>")
        self.assertEqual(pretrained_emb.embed_vocab[2], "to")

        # stoi is the inverse mapping, from token to row index.
        self.assertEqual(len(pretrained_emb.stoi), VOCAB_SIZE)
        self.assertEqual(pretrained_emb.stoi["</s>"], 0)
        self.assertEqual(pretrained_emb.stoi["to"], 2)

        # The weight matrix has one row per token: (vocab_size, embed_dim).
        self.assertEqual(pretrained_emb.embedding_vectors.size()[0],
                         VOCAB_SIZE)
        self.assertEqual(pretrained_emb.embedding_vectors.size()[1], EMB_SIZE)
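For reference, the attributes asserted above can be exercised directly. A minimal sketch, assuming the module lives at pytext.utils.embeddings_utils and using a hypothetical file path in place of the RAW_EMBEDDING_PATH test fixture:

from pytext.utils import embeddings_utils  # assumed import path

# Hypothetical stand-in for the RAW_EMBEDDING_PATH test fixture.
emb = embeddings_utils.PretrainedEmbedding("/tmp/word_vectors.txt")

tokens = emb.embed_vocab          # list of tokens in file order
first = emb.stoi[tokens[0]]       # stoi is the inverse mapping, so this is 0
vectors = emb.embedding_vectors   # torch.Tensor of shape (vocab_size, embed_dim)
print(len(tokens), first, vectors.size())
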
Code Example #2
    def test_initialize_embeddings_weights(self):
        text_field = get_text_field()
        pretrained_emb = embeddings_utils.PretrainedEmbedding(
            RAW_EMBEDDING_PATH, text_field.lower)
        # Remap the pretrained vectors onto the field's vocab; tokens missing
        # from the file are filled according to EmbedInitStrategy.ZERO.
        pretrained_emb_tensor = pretrained_emb.initialize_embeddings_weights(
            text_field.vocab.stoi, VocabMeta.UNK_TOKEN, EMB_SIZE,
            EmbedInitStrategy.ZERO)

        # Four rows: two special tokens plus "good" and "boy".
        self.assertEqual(pretrained_emb_tensor.size()[0], 4)
        self.assertEqual(pretrained_emb_tensor.size()[1], EMB_SIZE)

        self.assertEqual(text_field.vocab.itos[2], "good")
        self.assertEqual(text_field.vocab.itos[3], "boy")

        self.assertEqual(text_field.vocab.stoi["good"], 2)
        self.assertEqual(text_field.vocab.stoi["boy"], 3)
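To see how the returned tensor is typically consumed, here is a hedged sketch. The vocab dict, the "<unk>" token, the dimension of 300, and both import paths are assumptions; nn.Embedding.from_pretrained is standard PyTorch:

import torch.nn as nn

from pytext.utils import embeddings_utils                  # assumed import path
from pytext.config.field_config import EmbedInitStrategy   # assumed import path

EMB_DIM = 300  # hypothetical embedding dimension
# The second argument lowercases tokens, mirroring text_field.lower above.
emb = embeddings_utils.PretrainedEmbedding("/tmp/word_vectors.txt", True)

# Hypothetical model-side vocab; tokens absent from the pretrained file are
# filled per the init strategy (ZERO leaves them as zero vectors).
model_stoi = {"<unk>": 0, "<pad>": 1, "good": 2, "boy": 3}
weights = emb.initialize_embeddings_weights(
    model_stoi, "<unk>", EMB_DIM, EmbedInitStrategy.ZERO)

# Seed a standard embedding layer with the remapped weights.
embedding_layer = nn.Embedding.from_pretrained(weights, freeze=False)
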
Code Example #3
File: data_handler.py  Project: yuxuan2015/pytext
    def init_feature_metadata(
        self,
        train_data: textdata.Dataset,
        eval_data: textdata.Dataset,
        test_data: textdata.Dataset,
    ):
        # field metadata
        self.metadata.features = {}
        # build vocabs for features
        for name, feat in self.features.items():
            weights = None
            if feat.use_vocab:
                pretrained_embeddings = None
                pretrained_embeddings_path = getattr(
                    feat, "pretrained_embeddings_path", None)
                if pretrained_embeddings_path:
                    print("load pretrained embeddings from {}".format(
                        pretrained_embeddings_path))
                    pretrained_embeddings = embeddings_utils.PretrainedEmbedding(
                        pretrained_embeddings_path, feat.lower)

                if hasattr(feat, "vocab"):  # Don't rebuild vocab
                    print(
                        f"Vocab for feature {name} has been built. Not rebuilding."
                    )
                else:
                    print(f"Building vocab for feature {name}.")
                    vocab_data = self._get_data_to_build_vocab(
                        feat, train_data, eval_data, test_data,
                        pretrained_embeddings)
                    feat.build_vocab(*vocab_data, min_freq=feat.min_freq)
                print("{} field's vocabulary size is {}".format(
                    name, len(feat.vocab)))

                # Initialize pretrained embedding weights.
                if pretrained_embeddings:
                    weights = pretrained_embeddings.initialize_embeddings_weights(
                        feat.vocab.stoi,
                        VocabMeta.UNK_TOKEN,
                        feat.embed_dim,
                        feat.embedding_init_strategy,
                    )  # this is of type torch.Tensor

            meta = feat.get_meta()
            meta.pretrained_embeds_weight = weights
            self.metadata.features[name] = meta
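The weights stored on meta.pretrained_embeds_weight are later used to build embedding tables. A sketch of one way a consumer might do that; make_embedding is a hypothetical helper, not project code, while nn.Embedding.from_pretrained is standard PyTorch:

import torch.nn as nn

def make_embedding(meta, vocab_size, embed_dim):
    # Hypothetical consumer: prefer the pretrained weights attached above,
    # otherwise fall back to a randomly initialized table.
    if meta.pretrained_embeds_weight is not None:
        return nn.Embedding.from_pretrained(
            meta.pretrained_embeds_weight, freeze=False)
    return nn.Embedding(vocab_size, embed_dim)
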
Code Example #4
File: data_handler.py  Project: kwikBioInc/pytext
    def init_target_metadata(
        self,
        train_data: textdata.Dataset,
        eval_data: textdata.Dataset,
        test_data: textdata.Dataset,
    ):
        self.metadata.target = []
        # build vocabs for label fields
        for name, label in self.labels.items():
            if name in [Target.TARGET_PROB_FIELD, Target.TARGET_LOGITS_FIELD]:
                continue
            # The test data is needed so that the vocab covers all of its labels.
            # This is particularly important when BIO tagging is enabled: train
            # and eval may contain only B-[Label], while test can contain both
            # B-[Label] and I-[Label].
            weights = None
            if label.use_vocab:
                if not hasattr(label, "vocab"):  # Build vocab only once
                    print("Building vocab for label {}".format(name))
                    label.build_vocab(train_data, eval_data, test_data)
                else:
                    print(
                        f"Vocab for label {name} has been built. Not rebuilding."
                    )
                print("{} field's vocabulary size is {}".format(
                    name, len(label.vocab.itos)))
                pretrained_embeddings = None
                pretrained_embeddings_path = getattr(
                    label, "pretrained_embeddings_path", None)
                if pretrained_embeddings_path:
                    pretrained_embeddings = embeddings_utils.PretrainedEmbedding(
                        pretrained_embeddings_path)
                if pretrained_embeddings:
                    weights = pretrained_embeddings.initialize_embeddings_weights(
                        label.vocab.stoi,
                        label.unk_token,
                        label.embed_dim,
                        label.embedding_init_strategy,
                    )  # this is of type torch.Tensor

            meta = label.get_meta()
            meta.pretrained_embeds_weight = weights
            self.metadata.target.append(meta)
        if len(self.metadata.target) == 1:
            [self.metadata.target] = self.metadata.target
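The last two lines use single-element list unpacking: when there is exactly one target, self.metadata.target is replaced by the element itself rather than a one-item list. A standalone illustration of the idiom, with a hypothetical value:

targets = ["doc_label_meta"]  # hypothetical one-element metadata list
[targets] = targets           # same as: targets = targets[0]
assert targets == "doc_label_meta"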