    def get_test_pipeline(self, model, tokenizer, feature_extractor):
        if tokenizer is None:
            self.skipTest("No tokenizer")
            return

        elif isinstance(model.config,
                        (LxmertConfig, CLIPConfig, Wav2Vec2Config)):
            self.skipTest(
                "This is an Lxmert bimodal model, we need to find a more consistent way to switch on those models."
            )
            return
        elif model.config.is_encoder_decoder:
            self.skipTest(
                """encoder_decoder models are trickier for this pipeline.
                Do we want encoder + decoder inputs to get some features?
                Do we want encoder-only features?
                For now, ignore those.
                """)

            return
        feature_extractor = FeatureExtractionPipeline(
            model=model,
            tokenizer=tokenizer,
            feature_extractor=feature_extractor)
        return feature_extractor, ["This is a test", "This is another test"]

    def run_pipeline_test(self, model, tokenizer, feature_extractor):
        if tokenizer is None:
            self.skipTest("No tokenizer")
            return

        elif isinstance(model.config, (LxmertConfig, CLIPConfig)):
            self.skipTest(
                "This is an Lxmert bimodal model, we need to find a more consistent way to switch on those models."
            )
            return
        elif model.config.is_encoder_decoder:
            self.skipTest(
                """encoder_decoder models are trickier for this pipeline.
                Do we want encoder + decoder inputs to get some features?
                Do we want encoder-only features?
                For now, ignore those.
                """)

            return

        feature_extractor = FeatureExtractionPipeline(
            model=model,
            tokenizer=tokenizer,
            feature_extractor=feature_extractor)

        outputs = feature_extractor("This is a test")

        shape = self.get_shape(outputs)
        self.assertEqual(shape[0], 1)

        outputs = feature_extractor(["This is a test", "Another test"])
        shape = self.get_shape(outputs)
        self.assertEqual(shape[0], 2)
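
The get_shape helper called above is not part of this snippet. A minimal sketch of what it might look like, assuming the pipeline returns rectangular nested lists of floats (so the list length at each nesting depth gives that dimension):

    def get_shape(self, input_, shape=None):
        # Hypothetical helper: walk the nested lists, recording one length
        # per nesting level; assumes all sublists share the same shape.
        if shape is None:
            shape = []
        if isinstance(input_, list):
            shape.append(len(input_))
            if len(input_) > 0:
                self.get_shape(input_[0], shape)
        return shape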
Example #3
    def _test_infer_dynamic_axis(self, model, tokenizer, framework):
        nlp = FeatureExtractionPipeline(model, tokenizer)

        variable_names = ["input_ids", "token_type_ids", "attention_mask", "output_0", "output_1"]
        input_vars, output_vars, shapes, tokens = infer_shapes(nlp, framework)

        # Assert all variables are present
        self.assertEqual(len(shapes), len(variable_names))
        self.assertTrue(all([var_name in shapes for var_name in variable_names]))
        self.assertSequenceEqual(variable_names[:3], input_vars)
        self.assertSequenceEqual(variable_names[3:], output_vars)

        # Assert inputs are {0: batch, 1: sequence}
        for var_name in ["input_ids", "token_type_ids", "attention_mask"]:
            self.assertDictEqual(shapes[var_name], {0: "batch", 1: "sequence"})

        # Assert outputs are {0: batch, 1: sequence} and {0: batch}
        self.assertDictEqual(shapes["output_0"], {0: "batch", 1: "sequence"})
        self.assertDictEqual(shapes["output_1"], {0: "batch"})
Example #4
    def run_pipeline_test(self, model, tokenizer):
        if isinstance(model.config, LxmertConfig):
            # This is a bimodal model; we need to find a more consistent way
            # to switch on those models.
            return

        feature_extractor = FeatureExtractionPipeline(model=model,
                                                      tokenizer=tokenizer)
        if feature_extractor.model.config.is_encoder_decoder:
            # encoder_decoder models are trickier for this pipeline.
            # Do we want encoder + decoder inputs to get some features?
            # Do we want encoder-only features?
            # For now ignore those.
            return

        outputs = feature_extractor("This is a test")

        shape = self.get_shape(outputs)
        self.assertEqual(shape[0], 1)

        outputs = feature_extractor(["This is a test", "Another test"])
        shape = self.get_shape(outputs)
        self.assertEqual(shape[0], 2)
Example #5
    with open("data/features/input.pickle", "rb") as f:
        arxiv_ids, categories, abstracts = pickle.load(f)

    corpus = abstracts

    batch_size = args.batch_size  # e.g. 60, constrained by GPU memory
    total_length = len(corpus)
    total_batch = total_length // batch_size  # drop any final partial batch
    with open("data/features/settings.pickle", "wb") as f:
        pickle.dump([batch_size, total_length, total_batch], f)
    print("settings saved.", flush=True)

    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    pp = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, device=device_id)  # device_id 0 selects GPU:0

    with torch.no_grad():
        rets = []
        for i in range(total_batch):
            batch_corpus = corpus[i*batch_size: (i+1)*batch_size]
            inputs = pp.tokenizer(batch_corpus, return_tensors=pp.framework, padding='max_length', max_length=512, truncation=True)
            inputs = pp.ensure_tensor_on_device(**inputs)
            ret = pp.model.bert(**inputs)  # .bert was verified only for this checkpoint: "nlptown/bert-base-multilingual-uncased-sentiment"
            # Three candidate feature sets: output_1 is the full per-token activation chain,
            # output_2 is the pooled first token ([CLS]), and output_3 concatenates the first
            # and last token states. output_1 contains much richer information, but its
            # dimension depends on document length.
            output_1 = torch.flatten(ret.last_hidden_state, start_dim=1)
            output_2 = ret.pooler_output
            output_3 = torch.cat([ret.last_hidden_state[:,0],ret.last_hidden_state[:,-1]], dim=1)
            rets.append(output_2.cpu())
            if i==0:
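
The snippet is truncated above. A hypothetical continuation, purely as a sketch (the output path and the choice to keep output_2 are assumptions, not the original code):

    # Hypothetical continuation: stack the pooled batch features and save them.
    features = torch.cat(rets, dim=0)  # (total_batch * batch_size, hidden_size)
    with open("data/features/output.pickle", "wb") as f:  # path is an assumption
        pickle.dump(features.numpy(), f)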
Example #6
# Author: Sida Liu ([email protected]), 2020
# Reference:
#   https://huggingface.co/transformers/master/quicktour.html
#   https://huggingface.co/transformers/master/main_classes/feature_extractor.html
import torch
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import FeatureExtractionPipeline

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
pp = FeatureExtractionPipeline(model=model, tokenizer=tokenizer,
                               device=0)  # 0: use GPU:0

# Each line in the corpus is a document; here are 4 examples:
corpus = """
Hello, the world!
Nice to meet you!
Dimension where cosine similarity is computed.
Small value to avoid division by zero.
""".strip().split("\n")

with torch.no_grad():
    inputs = pp._parse_and_tokenize(corpus)  # note: a private pipeline helper
    inputs = pp.ensure_tensor_on_device(**inputs)
    ret = pp.model.bert(
        **inputs
    )  # .bert was verified only for this checkpoint: "nlptown/bert-base-multilingual-uncased-sentiment"
    # Two candidate feature sets: the full per-token activation chain
    # (last_hidden_state) and the pooled first token (pooler_output).
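
The snippet is truncated above. A hedged sketch of one way the pooled features might be used (the pairing below is an illustration, not the original code):

    # Hypothetical continuation: compare two documents by the cosine
    # similarity of their pooled [CLS] features.
    feats = ret.pooler_output  # shape: (4, hidden_size) for this corpus
    sim = torch.nn.functional.cosine_similarity(feats[0], feats[1], dim=0)
    print(f"similarity(doc 0, doc 1) = {sim.item():.3f}")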
Example #7
import transformers
import torch
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, Pipeline, FeatureExtractionPipeline

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
fe_pipeline = FeatureExtractionPipeline(model, tokenizer)

if __name__ == "__main__":
    sent = "My name is Simola Nayak and I love cats."
    sent2 = "Huggingface is fun. You should try it."
    toks1 = tokenizer.tokenize(sent)
    info1 = tokenizer.encode_plus(sent)
    info2 = tokenizer.encode_plus(sent2)
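
The snippet stops before fe_pipeline is actually used. A minimal hedged usage, assuming the standard pipeline behavior of returning nested lists of per-token feature vectors:

    # Hypothetical usage -- not part of the original, truncated snippet.
    features = fe_pipeline([sent, sent2])
    # features: [batch][tokens][hidden_size] nested lists of floats
    print(len(features), len(features[0]), len(features[0][0]))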