def get_test_pipeline(self, model, tokenizer, feature_extractor):
    if tokenizer is None:
        self.skipTest("No tokenizer")
    elif isinstance(model.config, (LxmertConfig, CLIPConfig, Wav2Vec2Config)):
        self.skipTest(
            "This is a bimodal (Lxmert/CLIP) or audio (Wav2Vec2) model; we need a more consistent way to switch on those models."
        )
    elif model.config.is_encoder_decoder:
        self.skipTest(
            """encoder_decoder models are trickier for this pipeline.
            Do we want encoder + decoder inputs to get some features?
            Do we want encoder-only features?
            For now, ignore those.
            """
        )
    feature_extractor = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
    return feature_extractor, ["This is a test", "This is another test"]
def run_pipeline_test(self, model, tokenizer, feature_extractor):
    if tokenizer is None:
        self.skipTest("No tokenizer")
    elif isinstance(model.config, (LxmertConfig, CLIPConfig)):
        self.skipTest(
            "This is a bimodal (Lxmert/CLIP) model; we need a more consistent way to switch on those models."
        )
    elif model.config.is_encoder_decoder:
        self.skipTest(
            """encoder_decoder models are trickier for this pipeline.
            Do we want encoder + decoder inputs to get some features?
            Do we want encoder-only features?
            For now, ignore those.
            """
        )
    feature_extractor = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)

    outputs = feature_extractor("This is a test")
    shape = self.get_shape(outputs)
    self.assertEqual(shape[0], 1)

    outputs = feature_extractor(["This is a test", "Another test"])
    shape = self.get_shape(outputs)
    self.assertEqual(shape[0], 2)
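# The test methods above (and run_pipeline_test further below) call a
# `get_shape` helper that is not shown in these snippets. A minimal sketch of
# what it plausibly looks like, assuming the pipeline returns nested lists of
# floats (this is an illustration, not the test suite's actual helper):
def get_shape(self, input_):
    # Walk the nested lists and report one size per nesting level,
    # e.g. a (1, seq_len, 768) output yields the tuple (1, seq_len, 768).
    if isinstance(input_, list):
        inner = self.get_shape(input_[0]) if input_ else ()
        return (len(input_),) + inner
    return ()  # reached a float: no further dimensions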
def _test_infer_dynamic_axis(self, model, tokenizer, framework):
    nlp = FeatureExtractionPipeline(model, tokenizer)

    variable_names = ["input_ids", "token_type_ids", "attention_mask", "output_0", "output_1"]
    input_vars, output_vars, shapes, tokens = infer_shapes(nlp, framework)

    # Assert all variables are present
    self.assertEqual(len(shapes), len(variable_names))
    self.assertTrue(all(var_name in shapes for var_name in variable_names))
    self.assertSequenceEqual(variable_names[:3], input_vars)
    self.assertSequenceEqual(variable_names[3:], output_vars)

    # Assert inputs are {0: batch, 1: sequence}
    for var_name in ["input_ids", "token_type_ids", "attention_mask"]:
        self.assertDictEqual(shapes[var_name], {0: "batch", 1: "sequence"})

    # Assert outputs are {0: batch, 1: sequence} and {0: batch}
    self.assertDictEqual(shapes["output_0"], {0: "batch", 1: "sequence"})
    self.assertDictEqual(shapes["output_1"], {0: "batch"})
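# Hedged sketch (not part of the test above): where the inferred dynamic axes
# typically end up, i.e. an ONNX export in which the batch and sequence
# dimensions stay variable. Assumes `infer_shapes` is the helper from
# transformers.convert_graph_to_onnx, that `tokens` is a dict-like BatchEncoding
# of tensors, and that `nlp` is the pipeline built in the test; the output path
# and opset version are placeholders.
import torch
from transformers.convert_graph_to_onnx import infer_shapes

input_vars, output_vars, shapes, tokens = infer_shapes(nlp, "pt")
torch.onnx.export(
    nlp.model,
    (dict(tokens),),      # pass inputs as keyword arguments to avoid ordering pitfalls
    "model.onnx",         # placeholder output path
    input_names=input_vars,
    output_names=output_vars,
    dynamic_axes=shapes,  # e.g. {"input_ids": {0: "batch", 1: "sequence"}, ...}
    opset_version=12,
)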
def run_pipeline_test(self, model, tokenizer):
    if isinstance(model.config, LxmertConfig):
        # This is a bimodal model; we need to find a more consistent way
        # to switch on those models.
        return
    feature_extractor = FeatureExtractionPipeline(model=model, tokenizer=tokenizer)
    if feature_extractor.model.config.is_encoder_decoder:
        # encoder_decoder models are trickier for this pipeline.
        # Do we want encoder + decoder inputs to get some features?
        # Do we want encoder-only features?
        # For now, ignore those.
        return

    outputs = feature_extractor("This is a test")
    shape = self.get_shape(outputs)
    self.assertEqual(shape[0], 1)

    outputs = feature_extractor(["This is a test", "Another test"])
    shape = self.get_shape(outputs)
    self.assertEqual(shape[0], 2)
with open("data/features/input.pickle", "rb") as f: arxiv_ids, categories,abstracts = pickle.load(f) corpus = abstracts batch_size = args.batch_size # 60 is due to the limitation of the GPU memory total_length = len(corpus) total_batch = int((total_length-1)/batch_size) # throw away the last little bit with open("data/features/settings.pickle", "wb") as f: pickle.dump([batch_size, total_length, total_batch], f) print("settings saved.", flush=True) model_name = "nlptown/bert-base-multilingual-uncased-sentiment" model = AutoModelForSequenceClassification.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) pp = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, device=device_id) # 0: use GPU:0 with torch.no_grad(): rets = [] for i in range(total_batch): batch_corpus = corpus[i*batch_size: (i+1)*batch_size] inputs = pp.tokenizer(batch_corpus, return_tensors=pp.framework, padding='max_length', max_length=512, truncation=True) inputs = pp.ensure_tensor_on_device(**inputs) ret = pp.model.bert(**inputs) # I only stepped into this model: "nlptown/bert-base-multilingual-uncased-sentiment" # There are two choices: output_1 is the whole activation chain, output_2 is the processed first cell. # I feel that output_1 contains much richer information, but the dimension varies according to the length of the document. output_1 = torch.flatten(ret.last_hidden_state, start_dim=1) output_2 = ret.pooler_output output_3 = torch.cat([ret.last_hidden_state[:,0],ret.last_hidden_state[:,-1]], dim=1) rets.append(output_2.cpu()) if i==0:
# Author: Sida Liu ([email protected]), 2020
# References:
#   https://huggingface.co/transformers/master/quicktour.html
#   https://huggingface.co/transformers/master/main_classes/feature_extractor.html
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import FeatureExtractionPipeline

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
pp = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, device=0)  # 0: use GPU:0

# Each line in the corpus is a document; here are 4 examples:
corpus = """
Hello, the world!
Nice to meet you!
Dimension where cosine similarity is computed.
Small value to avoid division by zero.
""".strip().split("\n")

with torch.no_grad():
    inputs = pp._parse_and_tokenize(corpus)
    inputs = pp.ensure_tensor_on_device(**inputs)
    ret = pp.model.bert(**inputs)
    # I only stepped into this model: "nlptown/bert-base-multilingual-uncased-sentiment".
    # There are two choices: output_1 is the whole activation chain, output_2 is the
    # processed first token (as in the sibling script above):
    output_1 = torch.flatten(ret.last_hidden_state, start_dim=1)
    output_2 = ret.pooler_output
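# Hedged sketch (not in the original snippet): the corpus strings quote the
# parameter docs of cosine similarity, which suggests the document embeddings
# were compared pairwise. One way to do that with the pooled output_2 and the
# already-imported matplotlib; an illustration, not the author's code.
import torch.nn.functional as F

sim = F.cosine_similarity(output_2.unsqueeze(1), output_2.unsqueeze(0), dim=-1)
plt.imshow(sim.cpu())  # (num_docs, num_docs) similarity matrix; diagonal is 1.0
plt.colorbar()
plt.show()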
import transformers
import torch
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, Pipeline, FeatureExtractionPipeline

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
fe_pipeline = FeatureExtractionPipeline(model, tokenizer)

if __name__ == "__main__":
    sent = "My name is Simola Nayak and I love cats."
    sent2 = "Huggingface is fun. You should try it."

    toks1 = tokenizer.tokenize(sent)
    info1 = tokenizer.encode_plus(sent)
    info2 = tokenizer.encode_plus(sent2)
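    # Hedged usage sketch (added for illustration): run the pipeline itself on
    # both sentences; it returns one nested list of floats per input, with the
    # hidden size (768 for bert-base-uncased) at the innermost level.
    features = fe_pipeline([sent, sent2])
    print(len(features))  # one entry per input sentence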