def test_pt_defaults(self):
    """Build the ASR pipeline without naming a model, forcing the PyTorch framework.

    The test has no assertions: it passes as long as construction does not raise.
    """
    pipeline(task="automatic-speech-recognition", framework="pt")
# importing library
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers.pipelines import pipeline
import torch

# Modelling: load tokenizer and model from the same checkpoint and wrap them
# in a question-answering pipeline.
Tokenizer = AutoTokenizer.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)
Model = AutoModelForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)
comp = pipeline(task='question-answering', model=Model, tokenizer=Tokenizer)

# Print each question from the CSV followed by the pipeline's answer.
data = pd.read_csv('examples.csv')
for _, record in data.iterrows():
    context, question = record['context'], record['question']
    prediction = comp({'question': question, 'context': context})
    print(question)
    print(prediction['answer'])
import pandas as pd
from transformers.pipelines import pipeline

# Question-answering pipeline; model and tokenizer come from the same checkpoint.
hg_comp = pipeline(
    task='question-answering',
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2",
)

# Print each question from the CSV followed by the pipeline's answer.
data = pd.read_csv('examples.csv')
for _, record in data.iterrows():
    context, question = record['context'], record['question']
    prediction = hg_comp({'question': question, 'context': context})
    print(question)
    print(prediction['answer'])
import torch
from transformers.pipelines import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Specify model paths
local_model_path = "model/distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer and model from that path (nothing is saved here).
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForSequenceClassification.from_pretrained(local_model_path)

# torch.cuda.is_available is a function and has to be called. Comparing the
# function object itself with `is True` is always False, which silently pinned
# the pipeline to CPU (device=-1) even on machines with a GPU.
classifier = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

output = classifier([
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
    "I love this book",
])
print(output)
import pandas as pd
from transformers.pipelines import pipeline

# Question-answering pipeline; model and tokenizer come from the same checkpoint.
hg_comp = pipeline(
    task='question-answering',
    model="bert-large-uncased-whole-word-masking-finetuned-squad",
    tokenizer="bert-large-uncased-whole-word-masking-finetuned-squad",
)

# Print one "question -> answer" line per CSV row.
data = pd.read_csv('examples.csv')
for _, record in data.iterrows():
    context, question = record['context'], record['question']
    prediction = hg_comp({'question': question, 'context': context})
    print(question, end=' -> ')
    print(prediction['answer'])
def test_return_timestamps_ctc_fast(self):
    """Check char- and word-level timestamps returned by the ASR pipeline.

    Uses the "hf-internal-testing/tiny-random-wav2vec2" checkpoint on the first
    800 samples of one dummy LibriSpeech recording.
    """
    asr = pipeline(
        task="automatic-speech-recognition",
        model="hf-internal-testing/tiny-random-wav2vec2",
    )
    dataset = load_dataset(
        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
    ).sort("id")
    # Take short audio to keep the test readable
    samples = dataset[40]["audio"]["array"][:800]

    expected_text = "ZBT ZX G"
    expected_chars = [
        {"text": " ", "timestamp": (0.0, 0.012)},
        {"text": "Z", "timestamp": (0.012, 0.016)},
        {"text": "B", "timestamp": (0.016, 0.02)},
        {"text": "T", "timestamp": (0.02, 0.024)},
        {"text": " ", "timestamp": (0.024, 0.028)},
        {"text": "Z", "timestamp": (0.028, 0.032)},
        {"text": "X", "timestamp": (0.032, 0.036)},
        {"text": " ", "timestamp": (0.036, 0.04)},
        {"text": "G", "timestamp": (0.04, 0.044)},
    ]
    expected_words = [
        {"text": "ZBT", "timestamp": (0.012, 0.024)},
        {"text": "ZX", "timestamp": (0.028, 0.036)},
        {"text": "G", "timestamp": (0.04, 0.044)},
    ]

    # One chunk per character, spaces included.
    per_char = asr(samples, return_timestamps="char")
    self.assertEqual(per_char, {"text": expected_text, "chunks": expected_chars})

    # One chunk per word.
    per_word = asr(samples, return_timestamps="word")
    self.assertEqual(per_word, {"text": expected_text, "chunks": expected_words})
def test_chunking_and_timestamps(self):
    """Exercise chunked inference and timestamps with "facebook/wav2vec2-base-960h".

    Covers, in order: batched transcription of a 10x tiled recording, char-level
    timestamps, word-level timestamps, and word-level timestamps with the chunk
    length overridden to 2 s at call time (the pipeline is built with 10 s).
    """

    def as_chunks(pairs):
        # Expand compact (text, (start, end)) pairs into the chunk dicts the
        # pipeline output is compared against.
        return [{"text": text, "timestamp": span} for text, span in pairs]

    checkpoint = "facebook/wav2vec2-base-960h"
    transcript = "A MAN SAID TO THE UNIVERSE SIR I EXIST"

    model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
    speech_recognizer = pipeline(
        task="automatic-speech-recognition",
        model=model,
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
        framework="pt",
        chunk_length_s=10.0,
    )

    ds = load_dataset(
        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
    ).sort("id")
    audio = ds[40]["audio"]["array"]

    # Long-form input: the same recording repeated back to back.
    n_repeats = 10
    audio_tiled = np.tile(audio, n_repeats)
    tiled_output = speech_recognizer([audio_tiled], batch_size=2)
    self.assertEqual(tiled_output, [{"text": " ".join([transcript] * n_repeats)}])

    char_output = speech_recognizer(audio, return_timestamps="char")
    self.assertEqual(audio.shape, (74_400,))
    self.assertEqual(speech_recognizer.feature_extractor.sampling_rate, 16_000)
    # The audio is 74_400 / 16_000 = 4.65s long.
    expected_chars = as_chunks([
        ("A", (0.6, 0.62)),
        (" ", (0.62, 0.66)),
        ("M", (0.68, 0.7)),
        ("A", (0.78, 0.8)),
        ("N", (0.84, 0.86)),
        (" ", (0.92, 0.98)),
        ("S", (1.06, 1.08)),
        ("A", (1.14, 1.16)),
        ("I", (1.16, 1.18)),
        ("D", (1.2, 1.24)),
        (" ", (1.24, 1.28)),
        ("T", (1.28, 1.32)),
        ("O", (1.34, 1.36)),
        (" ", (1.38, 1.42)),
        ("T", (1.42, 1.44)),
        ("H", (1.44, 1.46)),
        ("E", (1.46, 1.5)),
        (" ", (1.5, 1.56)),
        ("U", (1.58, 1.62)),
        ("N", (1.64, 1.68)),
        ("I", (1.7, 1.72)),
        ("V", (1.76, 1.78)),
        ("E", (1.84, 1.86)),
        ("R", (1.86, 1.9)),
        ("S", (1.96, 1.98)),
        ("E", (1.98, 2.02)),
        (" ", (2.02, 2.06)),
        ("S", (2.82, 2.86)),
        ("I", (2.94, 2.96)),
        ("R", (2.98, 3.02)),
        (" ", (3.06, 3.12)),
        ("I", (3.5, 3.52)),
        (" ", (3.58, 3.6)),
        ("E", (3.66, 3.68)),
        ("X", (3.68, 3.7)),
        ("I", (3.9, 3.92)),
        ("S", (3.94, 3.96)),
        ("T", (4.0, 4.02)),
        (" ", (4.06, 4.1)),
    ])
    self.assertEqual(char_output, {"text": transcript, "chunks": expected_chars})

    word_output = speech_recognizer(audio, return_timestamps="word")
    expected_words = as_chunks([
        ("A", (0.6, 0.62)),
        ("MAN", (0.68, 0.86)),
        ("SAID", (1.06, 1.24)),
        ("TO", (1.28, 1.36)),
        ("THE", (1.42, 1.5)),
        ("UNIVERSE", (1.58, 2.02)),
        ("SIR", (2.82, 3.02)),
        ("I", (3.5, 3.52)),
        ("EXIST", (3.66, 4.02)),
    ])
    self.assertEqual(word_output, {"text": transcript, "chunks": expected_words})

    chunked_output = speech_recognizer(
        audio, return_timestamps="word", chunk_length_s=2.0
    )
    expected_chunked_words = as_chunks([
        ("A", (0.6, 0.62)),
        ("MAN", (0.68, 0.86)),
        ("SAID", (1.06, 1.24)),
        ("TO", (1.3, 1.36)),
        ("THE", (1.42, 1.48)),
        ("UNIVERSE", (1.58, 2.02)),
        # Tiny change linked to chunking.
        ("SIR", (2.84, 3.02)),
        ("I", (3.5, 3.52)),
        ("EXIST", (3.66, 4.02)),
    ])
    self.assertEqual(
        chunked_output, {"text": transcript, "chunks": expected_chunked_words}
    )
def test_large_model_pt_with_lm(self):
    """Compare LM-assisted and plain CTC decoding on one Spanish sample.

    Runs the same file three ways: with the pipeline's detected "ctc_with_lm"
    type, with the type forced to "ctc", and with "ctc_with_lm" again combined
    with 2 s chunking and word-level timestamps.
    """
    dataset = load_dataset("Narsil/asr_dummy")
    filename = dataset["test"][3]["file"]

    speech_recognizer = pipeline(
        task="automatic-speech-recognition",
        model="patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm",
        framework="pt",
    )
    self.assertEqual(speech_recognizer.type, "ctc_with_lm")

    with_lm_text = "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumaje"
    self.assertEqual(speech_recognizer(filename), {"text": with_lm_text})

    # Override back to pure CTC
    speech_recognizer.type = "ctc"
    # plumajre != plumaje
    pure_ctc_text = "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumajre"
    self.assertEqual(speech_recognizer(filename), {"text": pure_ctc_text})

    speech_recognizer.type = "ctc_with_lm"
    # Simple test with CTC with LM, chunking + timestamps
    chunked_output = speech_recognizer(
        filename, chunk_length_s=2.0, return_timestamps="word"
    )
    chunked_text = "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumajcri"
    word_spans = [
        ("y", (0.52, 0.54)),
        ("en", (0.6, 0.68)),
        ("las", (0.74, 0.84)),
        ("ramas", (0.94, 1.24)),
        ("medio", (1.32, 1.52)),
        ("sumergidas", (1.56, 2.22)),
        ("revoloteaban", (2.36, 3.0)),
        ("algunos", (3.06, 3.38)),
        ("pájaros", (3.46, 3.86)),
        ("de", (3.92, 4.0)),
        ("quimérico", (4.08, 4.6)),
        ("y", (4.66, 4.68)),
        ("legendario", (4.74, 5.26)),
        ("plumajcri", (5.34, 5.74)),
    ]
    expected_chunks = [
        {"text": word, "timestamp": span} for word, span in word_spans
    ]
    self.assertEqual(
        chunked_output, {"text": chunked_text, "chunks": expected_chunks}
    )
import pandas as pd
from transformers.pipelines import pipeline

# The tokenizer must come from the same checkpoint as the model. The original
# paired this multilingual cased BERT model with the tokenizer of
# "distilbert-base-uncased-distilled-squad", whose vocabulary is different, so
# the model was fed token ids that mean something else in its own vocabulary.
hg_comp = pipeline(
    'question-answering',
    model="mrm8488/bert-multi-cased-finetuned-xquadv1",
    tokenizer="mrm8488/bert-multi-cased-finetuned-xquadv1",
)

# Print the pipeline's answer for every (question, context) row of the CSV.
data = pd.read_csv('examples.csv')
for idx, row in data.iterrows():
    context = row['context']
    question = row['question']
    answer = hg_comp({'question': question, 'context': context})['answer']
    print(answer)
import pandas as pd
from transformers.pipelines import pipeline
from transformers import BertForQuestionAnswering, AutoTokenizer
from transformers import BertTokenizer, BertModel

# Model 1 - Cased Bert base model
model1 = BertForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')
tokenizer1 = AutoTokenizer.from_pretrained('deepset/bert-base-cased-squad2')
modelBC = pipeline('question-answering', model=model1, tokenizer=tokenizer1)

# Model 2 - Uncased Bert base model
tokenizer2 = BertTokenizer.from_pretrained('bert-base-uncased')
model2 = BertModel.from_pretrained("bert-base-uncased")
# Fixed: the tokenizer argument previously received model2 instead of tokenizer2.
# NOTE(review): BertModel is the bare encoder without a question-answering
# head, so this pipeline is not expected to produce answers as written. It is
# never called below; confirm which QA checkpoint was intended before using it.
modelUBC = pipeline('question-answering', model=model2, tokenizer=tokenizer2)

# Model 3 - Uncased DistilBert base model
hg_comp = pipeline(
    'question-answering',
    model="distilbert-base-uncased-distilled-squad",
    tokenizer="distilbert-base-uncased-distilled-squad",
)

# Reading the File
data = pd.read_csv('examples.csv')

# Preview of Data uploaded
print(data.head())

# Iterating through the dataset to generate answers on the basis of the context
for idx, row in data.iterrows():
    context = row['context']
    question = row['question']
    answer1 = modelBC({'question': question, 'context': context})['answer']
    # Fixed: the original `print("..."answer1)` was a syntax error that kept
    # the whole script from running.
    print(f"Answer from Cased Bert base model: {answer1}")
def answer(text, question):
    """Return the answer string the QA pipeline extracts for `question` from `text`.

    A new question-answering pipeline is built on every call from the
    module-level `model_QnA` and `tokenizer_QnA`.
    """
    qa_pipeline = pipeline(
        'question-answering', model=model_QnA, tokenizer=tokenizer_QnA
    )
    prediction = qa_pipeline({'question': question, 'context': text})
    return prediction['answer']
import pandas as pd
from transformers.pipelines import pipeline

# importing the file using pandas library
data = pd.read_csv('Example_Data - Sheet1.csv')

# using transformer model bert large uncased masking to predict the answer to
# our question and also getting the score of the answer
hg_comp = pipeline(
    'question-answering',
    model="bert-large-uncased-whole-word-masking-finetuned-squad",
    tokenizer="bert-large-uncased-whole-word-masking-finetuned-squad",
)

answer = []
score = []
questions = []
for idx, row in data.iterrows():
    context = row['context']
    question = row['question']
    questions.append(question)
    # Run the model once per row and read both fields from the same result.
    # The original called the pipeline twice with identical input, doubling
    # inference time for no benefit.
    result = hg_comp({'question': question, 'context': context})
    answer.append(result['answer'])
    score.append(result['score'])

print("")
print(
    " ####### MODEL - BERT LARGE UNCASED WHOLE WORD MASKING FINE TUNED #######"
)

# printing the question, answer and score in the following format for the executed model
# QUESTION | ANSWER | SCORE