Example #1
0
 def test_pt_defaults(self):
     """Smoke-test building the ASR pipeline with the PyTorch framework.

     No model is named and nothing is asserted: the test passes as long as
     constructing the pipeline does not raise.
     """
     pipeline(task="automatic-speech-recognition", framework="pt")
Example #2
0
#importing library
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers.pipelines import pipeline
import torch

#Modelling
# The tokenizer and the model are both loaded from the same checkpoint name.
SQUAD_BERT_CHECKPOINT = "bert-large-uncased-whole-word-masking-finetuned-squad"

Tokenizer = AutoTokenizer.from_pretrained(SQUAD_BERT_CHECKPOINT)
Model = AutoModelForQuestionAnswering.from_pretrained(SQUAD_BERT_CHECKPOINT)

comp = pipeline(task='question-answering', model=Model, tokenizer=Tokenizer)

data = pd.read_csv('examples.csv')

# For every CSV row, answer the row's question against the row's context and
# print the question followed by the answer, each on its own line.
for idx, row in data.iterrows():
    context = row['context']
    question = row['question']
    qa_input = {'question': question, 'context': context}
    answer = comp(qa_input)['answer']
    print(question, answer, sep='\n')
Example #3
0
import pandas as pd
from transformers.pipelines import pipeline

# One checkpoint name supplies both the model and the tokenizer.
ROBERTA_SQUAD2_CHECKPOINT = "deepset/roberta-base-squad2"

hg_comp = pipeline(
    task='question-answering',
    model=ROBERTA_SQUAD2_CHECKPOINT,
    tokenizer=ROBERTA_SQUAD2_CHECKPOINT,
)

data = pd.read_csv('examples.csv')

# Print each question and, on the next line, the answer extracted from its context.
for idx, row in data.iterrows():
    context, question = row['context'], row['question']
    prediction = hg_comp({'question': question, 'context': context})
    answer = prediction['answer']
    print(question)
    print(answer)
Example #4
0
import torch
from transformers.pipelines import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Path the tokenizer and model are loaded from
local_model_path = "model/distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer and the sequence-classification model from that path
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

model = AutoModelForSequenceClassification.from_pretrained(local_model_path)

# `torch.cuda.is_available` is a function and has to be *called*. Comparing the
# function object itself with `is True` is always False, which silently forced
# the pipeline onto the CPU (-1) even when a GPU was present.
device = 0 if torch.cuda.is_available() else -1

classifier = pipeline('sentiment-analysis',
                      model=model,
                      tokenizer=tokenizer,
                      device=device)

# Classify a small batch of sentences in one call and print the raw results.
output = classifier([
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
    "I love this book"
])

print(output)
Example #5
0
import pandas as pd
from transformers.pipelines import pipeline

# The same checkpoint name is used for the model and for the tokenizer.
BERT_LARGE_SQUAD_CHECKPOINT = "bert-large-uncased-whole-word-masking-finetuned-squad"

hg_comp = pipeline(
    task='question-answering',
    model=BERT_LARGE_SQUAD_CHECKPOINT,
    tokenizer=BERT_LARGE_SQUAD_CHECKPOINT,
)

data = pd.read_csv('examples.csv')

# Emit one "question    ->   answer" line per CSV row.
for idx, row in data.iterrows():
    context = row['context']
    question = row['question']
    answer = hg_comp({'question': question, 'context': context})['answer']
    print(question, answer, sep='    ->   ')
Example #6
0
    def test_return_timestamps_ctc_fast(self):
        """Check char- and word-level timestamps from the tiny random wav2vec2 model.

        The same 800-sample clip is transcribed twice, once with
        ``return_timestamps="char"`` and once with ``return_timestamps="word"``,
        and each full output dict is compared against fixed expectations.
        """

        def as_chunks(spans):
            # Expand (text, start, end) triples into the pipeline's chunk dicts.
            return [{"text": text, "timestamp": (start, end)}
                    for text, start, end in spans]

        speech_recognizer = pipeline(
            task="automatic-speech-recognition",
            model="hf-internal-testing/tiny-random-wav2vec2",
        )

        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy",
                               "clean",
                               split="validation")
        dataset = dataset.sort("id")
        # Take short audio to keep the test readable
        audio = dataset[40]["audio"]["array"][:800]

        transcript = "ZBT ZX G"
        char_spans = [
            (" ", 0.0, 0.012),
            ("Z", 0.012, 0.016),
            ("B", 0.016, 0.02),
            ("T", 0.02, 0.024),
            (" ", 0.024, 0.028),
            ("Z", 0.028, 0.032),
            ("X", 0.032, 0.036),
            (" ", 0.036, 0.04),
            ("G", 0.04, 0.044),
        ]
        word_spans = [
            ("ZBT", 0.012, 0.024),
            ("ZX", 0.028, 0.036),
            ("G", 0.04, 0.044),
        ]

        output = speech_recognizer(audio, return_timestamps="char")
        self.assertEqual(
            output,
            {"text": transcript, "chunks": as_chunks(char_spans)},
        )

        output = speech_recognizer(audio, return_timestamps="word")
        self.assertEqual(
            output,
            {"text": transcript, "chunks": as_chunks(word_spans)},
        )
Example #7
0
    def test_chunking_and_timestamps(self):
        """Check chunked transcription and timestamps with wav2vec2-base-960h.

        Steps, in order:
        1. Transcribe a 10x tiled clip passed as a one-element list with
           ``batch_size=2`` and compare the repeated sentence.
        2. Transcribe the single clip with char-level timestamps.
        3. Transcribe it with word-level timestamps.
        4. Repeat the word-level run with ``chunk_length_s=2.0``.
        """

        def as_chunks(spans):
            # Expand (text, start, end) triples into the pipeline's chunk dicts.
            return [{"text": text, "timestamp": (start, end)}
                    for text, start, end in spans]

        checkpoint = "facebook/wav2vec2-base-960h"
        speech_recognizer = pipeline(
            task="automatic-speech-recognition",
            model=Wav2Vec2ForCTC.from_pretrained(checkpoint),
            tokenizer=AutoTokenizer.from_pretrained(checkpoint),
            feature_extractor=AutoFeatureExtractor.from_pretrained(checkpoint),
            framework="pt",
            chunk_length_s=10.0,
        )

        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy",
                               "clean",
                               split="validation")
        dataset = dataset.sort("id")
        audio = dataset[40]["audio"]["array"]

        sentence = "A MAN SAID TO THE UNIVERSE SIR I EXIST"

        # Tiled audio: the sentence is expected n_repeats times, space-separated.
        n_repeats = 10
        audio_tiled = np.tile(audio, n_repeats)
        output = speech_recognizer([audio_tiled], batch_size=2)
        self.assertEqual(output, [{"text": " ".join([sentence] * n_repeats)}])

        output = speech_recognizer(audio, return_timestamps="char")
        self.assertEqual(audio.shape, (74_400, ))
        self.assertEqual(speech_recognizer.feature_extractor.sampling_rate,
                         16_000)
        # The audio is 74_400 / 16_000 = 4.65s long.
        char_spans = [
            ("A", 0.6, 0.62),
            (" ", 0.62, 0.66),
            ("M", 0.68, 0.7),
            ("A", 0.78, 0.8),
            ("N", 0.84, 0.86),
            (" ", 0.92, 0.98),
            ("S", 1.06, 1.08),
            ("A", 1.14, 1.16),
            ("I", 1.16, 1.18),
            ("D", 1.2, 1.24),
            (" ", 1.24, 1.28),
            ("T", 1.28, 1.32),
            ("O", 1.34, 1.36),
            (" ", 1.38, 1.42),
            ("T", 1.42, 1.44),
            ("H", 1.44, 1.46),
            ("E", 1.46, 1.5),
            (" ", 1.5, 1.56),
            ("U", 1.58, 1.62),
            ("N", 1.64, 1.68),
            ("I", 1.7, 1.72),
            ("V", 1.76, 1.78),
            ("E", 1.84, 1.86),
            ("R", 1.86, 1.9),
            ("S", 1.96, 1.98),
            ("E", 1.98, 2.02),
            (" ", 2.02, 2.06),
            ("S", 2.82, 2.86),
            ("I", 2.94, 2.96),
            ("R", 2.98, 3.02),
            (" ", 3.06, 3.12),
            ("I", 3.5, 3.52),
            (" ", 3.58, 3.6),
            ("E", 3.66, 3.68),
            ("X", 3.68, 3.7),
            ("I", 3.9, 3.92),
            ("S", 3.94, 3.96),
            ("T", 4.0, 4.02),
            (" ", 4.06, 4.1),
        ]
        self.assertEqual(
            output,
            {"text": sentence, "chunks": as_chunks(char_spans)},
        )

        output = speech_recognizer(audio, return_timestamps="word")
        word_spans = [
            ("A", 0.6, 0.62),
            ("MAN", 0.68, 0.86),
            ("SAID", 1.06, 1.24),
            ("TO", 1.28, 1.36),
            ("THE", 1.42, 1.5),
            ("UNIVERSE", 1.58, 2.02),
            ("SIR", 2.82, 3.02),
            ("I", 3.5, 3.52),
            ("EXIST", 3.66, 4.02),
        ]
        self.assertEqual(
            output,
            {"text": sentence, "chunks": as_chunks(word_spans)},
        )

        output = speech_recognizer(audio,
                                   return_timestamps="word",
                                   chunk_length_s=2.0)
        # Tiny changes linked to chunking: the TO, THE and SIR boundaries
        # differ slightly from the run above.
        word_spans_2s_chunks = [
            ("A", 0.6, 0.62),
            ("MAN", 0.68, 0.86),
            ("SAID", 1.06, 1.24),
            ("TO", 1.3, 1.36),
            ("THE", 1.42, 1.48),
            ("UNIVERSE", 1.58, 2.02),
            ("SIR", 2.84, 3.02),
            ("I", 3.5, 3.52),
            ("EXIST", 3.66, 4.02),
        ]
        self.assertEqual(
            output,
            {"text": sentence, "chunks": as_chunks(word_spans_2s_chunks)},
        )
Example #8
0
    def test_large_model_pt_with_lm(self):
        """Check the Spanish wav2vec2 pipeline with and without its language model.

        The pipeline is expected to report type ``"ctc_with_lm"``. The same
        file is then transcribed with the LM, with the type overridden to
        plain ``"ctc"``, and finally with the LM again using 2 s chunks and
        word-level timestamps.
        """
        dataset = load_dataset("Narsil/asr_dummy")
        filename = dataset["test"][3]["file"]

        speech_recognizer = pipeline(
            task="automatic-speech-recognition",
            model="patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm",
            framework="pt",
        )
        self.assertEqual(speech_recognizer.type, "ctc_with_lm")

        output = speech_recognizer(filename)
        expected_with_lm = {
            "text":
            "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumaje"
        }
        self.assertEqual(output, expected_with_lm)

        # Override back to pure CTC
        speech_recognizer.type = "ctc"
        output = speech_recognizer(filename)
        # plumajre != plumaje
        expected_pure_ctc = {
            "text":
            "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumajre"
        }
        self.assertEqual(output, expected_pure_ctc)

        speech_recognizer.type = "ctc_with_lm"
        # Simple test with CTC with LM, chunking + timestamps
        output = speech_recognizer(filename,
                                   chunk_length_s=2.0,
                                   return_timestamps="word")
        word_spans = [
            ("y", 0.52, 0.54),
            ("en", 0.6, 0.68),
            ("las", 0.74, 0.84),
            ("ramas", 0.94, 1.24),
            ("medio", 1.32, 1.52),
            ("sumergidas", 1.56, 2.22),
            ("revoloteaban", 2.36, 3.0),
            ("algunos", 3.06, 3.38),
            ("pájaros", 3.46, 3.86),
            ("de", 3.92, 4.0),
            ("quimérico", 4.08, 4.6),
            ("y", 4.66, 4.68),
            ("legendario", 4.74, 5.26),
            ("plumajcri", 5.34, 5.74),
        ]
        expected_chunked = {
            "text":
            "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumajcri",
            "chunks": [{"text": word, "timestamp": (start, end)}
                       for word, start, end in word_spans],
        }
        self.assertEqual(output, expected_chunked)
Example #9
0
import pandas as pd
from transformers.pipelines import pipeline

# The tokenizer must come from the same checkpoint as the model. The previous
# pairing fed ids from an uncased DistilBERT vocabulary into a multilingual
# cased BERT model, so the ids did not correspond to the model's embeddings.
XQUAD_CHECKPOINT = "mrm8488/bert-multi-cased-finetuned-xquadv1"

hg_comp = pipeline('question-answering',
                   model=XQUAD_CHECKPOINT,
                   tokenizer=XQUAD_CHECKPOINT)

data = pd.read_csv('examples.csv')

# Print the answer extracted from each row's context for that row's question.
for idx, row in data.iterrows():
    context = row['context']
    question = row['question']
    answer = hg_comp({'question': question, 'context': context})['answer']
    print(answer)
Example #10
0
import pandas as pd
from transformers.pipelines import pipeline
from transformers import BertForQuestionAnswering, AutoTokenizer
from transformers import BertTokenizer, BertModel

# Model 1 - Cased Bert base model
model1 = BertForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')
tokenizer1 = AutoTokenizer.from_pretrained('deepset/bert-base-cased-squad2')
modelBC = pipeline('question-answering', model=model1, tokenizer=tokenizer1)


# Model 2 - Uncased Bert base model
# A question-answering pipeline needs a model with a span-prediction head, so
# load BertForQuestionAnswering instead of the bare BertModel encoder, and pass
# the tokenizer (not the model) as `tokenizer`.
# NOTE(review): 'bert-base-uncased' is not a QA-finetuned checkpoint, so its QA
# head is presumably freshly initialised -- confirm, or swap in a
# SQuAD-finetuned uncased checkpoint before relying on its answers.
tokenizer2 = BertTokenizer.from_pretrained('bert-base-uncased')
model2 = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
modelUBC = pipeline('question-answering', model=model2, tokenizer=tokenizer2)

# Model 3 - Uncased DistilBert base model
hg_comp = pipeline('question-answering',
                   model="distilbert-base-uncased-distilled-squad",
                   tokenizer="distilbert-base-uncased-distilled-squad")

# Reading the file
data = pd.read_csv('examples.csv')

# Preview of the loaded data
print(data.head())

# Iterate over the dataset and answer each question from its context using
# the cased BERT pipeline.
for idx, row in data.iterrows():
    context = row['context']
    question = row['question']
    answer1 = modelBC({'question': question, 'context': context})['answer']
    # The original line lacked a separator between the label and the value,
    # which is a syntax error.
    print(f"Answer from Cased Bert base model: {answer1}")
def answer(text, question):
    """Return the answer string the QA pipeline extracts from ``text``.

    A question-answering pipeline is built on every call from the
    module-level ``model_QnA`` and ``tokenizer_QnA`` (defined outside this
    function), then queried with ``question`` against ``text`` as context.
    """
    qa_pipeline = pipeline('question-answering', model=model_QnA, tokenizer=tokenizer_QnA)
    prediction = qa_pipeline({'question': question, 'context': text})
    return prediction['answer']
Example #12
0
import pandas as pd
from transformers.pipelines import pipeline

# Load the input rows with pandas
data = pd.read_csv('Example_Data - Sheet1.csv')

# Question-answering pipeline on the BERT large uncased whole-word-masking
# SQuAD checkpoint; each result provides both the answer and its score.
hg_comp = pipeline(
    'question-answering',
    model="bert-large-uncased-whole-word-masking-finetuned-squad",
    tokenizer="bert-large-uncased-whole-word-masking-finetuned-squad")
answer = []
score = []
questions = []
for idx, row in data.iterrows():
    context = row['context']
    question = row['question']
    questions.append(question)
    # Run inference once per row and read both fields from the same result,
    # instead of calling the model a second time with identical input.
    result = hg_comp({'question': question, 'context': context})
    answer.append(result['answer'])
    score.append(result['score'])
print("")
print(
    "                                      ####### MODEL - BERT LARGE UNCASED WHOLE WORD MASKING FINE TUNED #######"
)

#printing the question, answer and score in the following format for the executed model
# QUESTION             | ANSWER   | SCORE