Example #1
def preload_weights():
    models = {
        "Camembert_Q_A": "illuin/camembert-large-fquad",
        "Camembert": "camembert/camembert-large",
        "Bert": "bert-large-uncased",
        "Bert_Q_A": "bert-large-uncased-whole-word-masking-finetuned-squad"
    }

    for folder in models.keys():
        p = f'{WEIGHTS_PATH}/{folder}'
        if not os.path.exists(p):
            os.makedirs(p)

    if not os.path.exists(f'{WEIGHTS_PATH}/Camembert_Q_A/pytorch_model.bin'):
        QA_MODEL_NAME_FR = "illuin/camembert-large-fquad"
        QA_TOK_FR = AutoTokenizer.from_pretrained(QA_MODEL_NAME_FR)
        QA_MODEL_FR = CamembertForQuestionAnswering.from_pretrained(
            QA_MODEL_NAME_FR)
        QA_FR = QuestionAnsweringPipeline(model=QA_MODEL_FR,
                                          tokenizer=QA_TOK_FR)
        QA_FR.save_pretrained(f'{WEIGHTS_PATH}/Camembert_Q_A')
        del QA_FR
        del QA_TOK_FR
        del QA_MODEL_FR

    if not os.path.exists(f'{WEIGHTS_PATH}/Camembert/pytorch_model.bin'):
        EMB_MODEL_NAME_FR = "camembert/camembert-large"
        EMB_TOK_FR = AutoTokenizer.from_pretrained(EMB_MODEL_NAME_FR)
        EMB_FR = AutoModel.from_pretrained(EMB_MODEL_NAME_FR)
        EMB_FR.save_pretrained(f'{WEIGHTS_PATH}/Camembert')
        EMB_TOK_FR.save_pretrained(f'{WEIGHTS_PATH}/Camembert')
        del EMB_TOK_FR
        del EMB_FR

    if not os.path.exists(f'{WEIGHTS_PATH}/Bert_Q_A/pytorch_model.bin'):
        QA_MODEL_NAME_EN = "bert-large-uncased-whole-word-masking-finetuned-squad"
        QA_TOK_EN = AutoTokenizer.from_pretrained(QA_MODEL_NAME_EN)
        QA_MODEL_EN = AutoModelForQuestionAnswering.from_pretrained(
            QA_MODEL_NAME_EN)
        QA_EN = QuestionAnsweringPipeline(model=QA_MODEL_EN,
                                          tokenizer=QA_TOK_EN)
        QA_EN.save_pretrained(f'{WEIGHTS_PATH}/Bert_Q_A')
        del QA_EN
        del QA_MODEL_EN
        del QA_TOK_EN

    if not os.path.exists(f'{WEIGHTS_PATH}/Bert/pytorch_model.bin'):
        EMB_MODEL_NAME_EN = "bert-large-uncased"
        EMB_TOK_EN = AutoTokenizer.from_pretrained(EMB_MODEL_NAME_EN)
        EMB_EN = AutoModel.from_pretrained(EMB_MODEL_NAME_EN)
        EMB_EN.save_pretrained(f'{WEIGHTS_PATH}/Bert')
        EMB_TOK_EN.save_pretrained(f'{WEIGHTS_PATH}/Bert')
        del EMB_TOK_EN
        del EMB_EN
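
Once the weights have been cached this way, they can be reloaded from the local folders instead of the Hub. A minimal sketch, assuming the WEIGHTS_PATH layout written by preload_weights() above (the helper name is hypothetical, not part of the original snippet):

def load_cached_fr_qa():
    # Hypothetical helper: rebuild the French QA pipeline from the
    # directory written by preload_weights() above.
    path = f'{WEIGHTS_PATH}/Camembert_Q_A'
    tok = AutoTokenizer.from_pretrained(path)
    model = CamembertForQuestionAnswering.from_pretrained(path)
    return QuestionAnsweringPipeline(model=model, tokenizer=tok)
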
    def get_test_pipeline(self, model, tokenizer, feature_extractor):
        if isinstance(model.config, LxmertConfig):
            # This is a bimodal model; we need to find a more consistent way
            # to switch on those models.
            return None, None
        question_answerer = QuestionAnsweringPipeline(model, tokenizer)

        examples = [
            {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
            {"question": "In what field is HuggingFace ?", "context": "HuggingFace is  an AI startup."},
        ]
        return question_answerer, examples
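
For reference, a dict like the examples above can be passed straight to a question-answering pipeline; a minimal standalone sketch (the checkpoint choice here is an assumption, any SQuAD-style model works):

from transformers import pipeline

qa = pipeline("question-answering",
              model="distilbert-base-cased-distilled-squad")
result = qa({"question": "Where was HuggingFace founded ?",
             "context": "HuggingFace was founded in Paris."})
print(result["answer"], result["score"])  # answer string plus a confidence score
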
Example #3
def load_models():
    print(f"Loading models ")
    print(":floppy_disk: [yellow]Loading FR model [/yellow]", end="")
    print(f"on [blue]{DEVICE}[/blue]...")
    QA_TOK_FR = AutoTokenizer.from_pretrained(QA_MODEL_NAME_FR)
    QA_MODEL_FR = CamembertForQuestionAnswering.from_pretrained(
        QA_MODEL_NAME_FR)
    _LOADED_MODELS['FR'] = {
        'QNA':
        QuestionAnsweringPipeline(model=QA_MODEL_FR,
                                  tokenizer=QA_TOK_FR,
                                  device=DEVICE_PIPELINE),
        'TOK':
        AutoTokenizer.from_pretrained(EMB_MODEL_NAME_FR),
        'EMB':
        AutoModel.from_pretrained(EMB_MODEL_NAME_FR).to(DEVICE)
    }
    del QA_TOK_FR
    del QA_MODEL_FR
    print(":floppy_disk: [green]Loaded FR models [/green]")

    print(":floppy_disk: [yellow]Loading EN model [/yellow]", end="")
    print(f"on [blue]{DEVICE}[/blue]...")
    QA_TOK_EN = AutoTokenizer.from_pretrained(QA_MODEL_NAME_EN)
    QA_MODEL_EN = AutoModelForQuestionAnswering.from_pretrained(
        QA_MODEL_NAME_EN)
    _LOADED_MODELS['EN'] = {
        'QNA':
        QuestionAnsweringPipeline(model=QA_MODEL_EN,
                                  tokenizer=QA_TOK_EN,
                                  device=DEVICE_PIPELINE),
        'TOK':
        AutoTokenizer.from_pretrained(EMB_MODEL_NAME_EN),
        'EMB':
        AutoModel.from_pretrained(EMB_MODEL_NAME_EN).to(DEVICE)
    }
    del QA_TOK_EN
    del QA_MODEL_EN
    print(":floppy_disk: [green]Loaded EN models[/green]")
    def __init__(self,
                 model_type="DISTILBERT",
                 model_name="distilbert-base-cased-distilled-squad"):

        self.adaptor = get_adaptor(model_type)

        model = AutoModelForQuestionAnswering.from_pretrained(model_name)

        super().__init__(model_type, model_name, model)
        device_number = detect_cuda_device_number()

        self._pipeline = QuestionAnsweringPipeline(model=self.model,
                                                   tokenizer=self.tokenizer,
                                                   device=device_number)

        self._trainer = QATrainer(self.model, model_type, self.tokenizer,
                                  self._device, self.logger)
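
A wrapper like this would typically expose the pipeline through a small convenience method; a hedged sketch of what that could look like (the method name and return handling are assumptions, not part of the original class):

    def answer_question(self, context, question):
        # Delegate to the underlying HF pipeline; the result is a dict
        # with 'answer', 'score', 'start' and 'end' keys.
        return self._pipeline(question=question, context=context)
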
Example #5
TOK_QA_EN = BertTokenizer.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad")
QA_EN = BertForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad")

EMB_EN = EMB_EN.to(DEVICE)
QA_EN = QA_EN.to(DEVICE)
# TOK_QA_FR = CamembertTokenizer.from_pretrained("illuin/camembert-large-fquad")
# QA_FR = CamembertForQuestionAnswering.from_pretrained(
# "illuin/camembert-large-fquad")

# PIP_Q_A_FR = pipeline("question-answering", model=QA_FR, tokenizer=TOK_QA_FR)

PIP_Q_A_EN = QuestionAnsweringPipeline(model=QA_EN,
                                       tokenizer=TOK_QA_EN,
                                       device=dev_pipeline)

# TOK_EMB_FR = TOK_EMB_FR.to(DEVICE)
# EMB_FR = EMB_FR.to(DEVICE)
# PIP_Q_A_FR = PIP_Q_A_FR.to(DEVICE)


@app.post("/embeddings")
async def get_embedding(request):
    lang = request.json.get('lang')
    text = request.json.get('text')

    if lang == "fr":
        embedder = EMB_FR
        tokenizer = TOK_EMB_FR
Example #6
bertBaseUncased = "/home/sabur/Downloads/TweetQAexperiments/bert-base-uncased-vocab.txt"
bertLargeCased = "/home/sabur/Downloads/TweetQAexperiments/bert-large-cased-vocab.txt"
bertLargeUncased = "/home/sabur/Downloads/TweetQAexperiments/bert-large-uncased-vocab.txt"
# GPT-2 vocabularies
gpt2Vocab = "gpt2-vocab.json"
gpt2LargeVocab = "gpt2-large-vocab.json"
# Instantiate a BERT WordPiece tokenizer
WordPiece = BertWordPieceTokenizer(bertLargeUncased)
WordPieceEncoder = WordPiece.encode(sentence)
# Print the ids, tokens and offsets
print(WordPieceEncoder.ids)
print(WordPieceEncoder.tokens)
print(WordPieceEncoder.offsets)

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# QuestionAnsweringPipeline has no from_pretrained(); load the TF QA model directly
model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
input_ids = tf.constant(
    tokenizer.encode("Hello, my dog is cute",
                     add_special_tokens=True))[None, :]  # Batch size 1
outputs = model(input_ids)
start_scores, end_scores = outputs[:2]
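
To turn start_scores and end_scores into an answer span, the usual pattern is to take the argmax of each and decode the tokens in between; a minimal sketch, assuming eager TensorFlow and the XLNet tokenizer defined above:

start_idx = int(tf.argmax(start_scores, axis=1)[0])
end_idx = int(tf.argmax(end_scores, axis=1)[0])
answer_ids = input_ids[0, start_idx:end_idx + 1].numpy().tolist()
print(tokenizer.decode(answer_ids))  # decoded answer span
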

trans = BertTokenizer(bertLargeUncased,
                      do_lower_case=True,
                      do_basic_tokenize=True,
                      never_split=None,
                      unk_token='[UNK]',
                      sep_token='[SEP]',
                      pad_token='[PAD]',
                      cls_token='[CLS]',
                      mask_token='[MASK]')
Example #7
    def __init__(self, tokenizer, model):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        model = AutoModelForQuestionAnswering.from_pretrained(model)
        self.nlp = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer)
    def run_pipeline_test(self, model, tokenizer, feature_extractor):
        if isinstance(model.config, LxmertConfig):
            # This is a bimodal model; we need to find a more consistent way
            # to switch on those models.
            return
        question_answerer = QuestionAnsweringPipeline(model, tokenizer)

        outputs = question_answerer(
            question="Where was HuggingFace founded ?",
            context="HuggingFace was founded in Paris.")
        self.assertEqual(
            outputs, {
                "answer": ANY(str),
                "start": ANY(int),
                "end": ANY(int),
                "score": ANY(float)
            })

        outputs = question_answerer(
            question=[
                "In what field is HuggingFace working ?",
                "In what field is HuggingFace working ?"
            ],
            context="HuggingFace was founded in Paris.",
        )
        self.assertEqual(
            outputs,
            [
                {
                    "answer": ANY(str),
                    "start": ANY(int),
                    "end": ANY(int),
                    "score": ANY(float)
                },
                {
                    "answer": ANY(str),
                    "start": ANY(int),
                    "end": ANY(int),
                    "score": ANY(float)
                },
            ],
        )

        outputs = question_answerer(
            question=[
                "What field is HuggingFace working ?",
                "In what field is HuggingFace ?"
            ],
            context=[
                "HuggingFace is a startup based in New-York",
                "HuggingFace is a startup founded in Paris",
            ],
        )
        self.assertEqual(
            outputs,
            [
                {
                    "answer": ANY(str),
                    "start": ANY(int),
                    "end": ANY(int),
                    "score": ANY(float)
                },
                {
                    "answer": ANY(str),
                    "start": ANY(int),
                    "end": ANY(int),
                    "score": ANY(float)
                },
            ],
        )

        with self.assertRaises(ValueError):
            question_answerer(question="",
                              context="HuggingFace was founded in Paris.")
        with self.assertRaises(ValueError):
            question_answerer(question=None,
                              context="HuggingFace was founded in Paris.")
        with self.assertRaises(ValueError):
            question_answerer(
                question="In what field is HuggingFace working ?", context="")
        with self.assertRaises(ValueError):
            question_answerer(
                question="In what field is HuggingFace working ?",
                context=None)

        outputs = question_answerer(
            question="Where was HuggingFace founded ?",
            context="HuggingFace was founded in Paris.",
            topk=20)
        self.assertEqual(outputs, [{
            "answer": ANY(str),
            "start": ANY(int),
            "end": ANY(int),
            "score": ANY(float)
        } for i in range(20)])

        # Very long contexts require multiple features
        outputs = question_answerer(
            question="Where was HuggingFace founded ?",
            context="HuggingFace was founded in Paris." * 20)
        self.assertEqual(
            outputs, {
                "answer": ANY(str),
                "start": ANY(int),
                "end": ANY(int),
                "score": ANY(float)
            })
model_names = {
    'dbert-s2': 'twmkn9/distilbert-base-uncased-squad2',
    'sbert-s2': 'mrm8488/bert-small-finetuned-squadv2',
    'dbert-s1': 'distilbert-base-uncased-distilled-squad',
    '_bert-s2': 'twmkn9/bert-base-uncased-squad2',
}

models = {
    k: {'model':AutoModelForQuestionAnswering.from_pretrained(v),
        'tokenizer':AutoTokenizer.from_pretrained(v)}
    for k,v in model_names.items()
}
for k,m in models.items():
    m['model'].eval()

pipelines = {
    k: QuestionAnsweringPipeline(**v, device=-1)
    for k,v in models.items()
}

def query_all(question,context):
    """Get answer to question given context for all pipelines"""
    if isinstance(context,dict):
        context = context['text']
    ansiprint(h1('question:') + '  ' + h2(question))
    ansiprint(h1('context:'))
    ctx_ = textwrap.fill(context, 60)
    ctx_ = textwrap.indent(ctx_, ' -- ')
    print(ctx_)
    for name,pipeline in pipelines.items():
        ansiprint(h3(name))
        answer = pipeline({'question':question, 'context':context},
Example #10
    def __init__(self,
                 sents: Union[str, SentenceStore],
                 index: Union[str, faiss.Index],
                 encoder: Optional[Union[str, SentenceEncoder]] = None,
                 cord19: Optional[CORD19] = None,
                 model: Optional[Union[BertForQuestionAnswering,
                                       PreTrainedModel]] = None,
                 tokenizer: Optional[Union[BertTokenizerFast,
                                           PreTrainedTokenizerBase]] = None,
                 model_name_or_path: Optional[str] = None,
                 max_seq_length: int = 256,
                 do_lower_case: Optional[bool] = None,
                 nlp_model: str = 'en_core_sci_sm',
                 model_device: Optional[str] = None,
                 encoder_device: Optional[str] = None,
                 **compressor_kwargs) -> None:
        """
        :param summarizer_hidden: Determines the hidden layer to use for
            embeddings. (Needs to be negative.)
        :param summarizer_reduce: Determines the reduction statistic of
            the encoding layer `(mean, median, max)`. In other words it
            determines how you want to reduce results.
        :param summarizer_kwargs: Kwargs to pass to the summarizer
            along w/input texts. Or with a `coronanlp.summarization.
            BertSummarizerArguments` instance. (These arguments can be
            overridden anytime). By either updating the properties in
            place e.g., `self.summarizer_kwargs.ratio=0.5`. Note that
            the `body` argument can be disregarded or left as None since
            it's always overridden.
        """
        self.max_seq_length = max_seq_length
        self.sents = SentenceStore.from_disk(sents) \
            if isinstance(sents, str) else sents
        assert isinstance(self.sents, SentenceStore)
        self.index = faiss.read_index(index) \
            if isinstance(index, str) else index
        assert isinstance(self.index, faiss.Index)

        sentencizer = None
        if cord19 is None and hasattr(self.sents, 'init_args'):
            cord19 = self.sents.init_cord19_dataset()
            if not cord19.sentencizer_enabled:
                cord19.init_sentencizer()
            sentencizer = cord19.sentencizer
            self.cord19 = cord19
        elif isinstance(cord19, CORD19):
            if not cord19.sentencizer_enabled:
                cord19.init_sentencizer()
            sentencizer = cord19.sentencizer
            self.cord19 = cord19
        else:
            sentencizer = SpacySentenceTokenizer(nlp_model)
        if sentencizer is not None:
            self._sentencizer = sentencizer
            self.nlp = sentencizer.nlp

        if model_name_or_path is None:
            model_name_or_path = self.default_model_name
        if do_lower_case is None:
            do_lower_case = self.do_lower_case
        if model is None or isinstance(model, str):
            self.model = BertForQuestionAnswering \
                .from_pretrained(model_name_or_path)
        elif isinstance(model, (PreTrainedModel, BertForQuestionAnswering)) \
                and self.architecture in model.config.architectures:
            self.model = model
        else:
            raise InvalidModelNameOrPathError

        if model_device is not None:
            device = torch.device(model_device)
            self.model = self.model.to(device)
        if tokenizer is None or isinstance(tokenizer, str):
            self.tokenizer = BertTokenizerFast \
                .from_pretrained(model_name_or_path, do_lower_case=do_lower_case)
        elif isinstance(tokenizer,
                        (BertTokenizerFast, PreTrainedTokenizerBase)):
            self.tokenizer = tokenizer
        else:
            raise InvalidModelNameOrPathError

        if encoder is None:
            base: PreTrainedModel = None
            if hasattr(self.model, 'bert'):
                base = self.model.bert
            elif hasattr(self.model, 'base_model'):
                base = self.model.base_model
            base_device = base.device.type
            self.encoder = SentenceEncoder(transformer=base,
                                           tokenizer=self.tokenizer,
                                           device=base_device)
        elif isinstance(encoder, str):
            if encoder_device is None:
                encoder_device = 'cpu'
            self.encoder = SentenceEncoder \
                .from_pretrained(encoder, device=encoder_device)
        elif isinstance(encoder, SentenceEncoder):
            self.encoder = encoder
        else:
            raise InvalidModelNameOrPathError

        self.compressor = Compressor(model=self.model.base_model,
                                     **compressor_kwargs)

        # The HF QuestionAnsweringPipeline takes a device index; -1 (the default) means CPU.
        device_index = -1
        if self.model.device.index is not None \
                and self.model.device.type == 'cuda':
            # Model using CUDA, set CUDA idx.
            device_index = self.model.device.index
        self.pipeline = QuestionAnsweringPipeline(model=self.model,
                                                  tokenizer=self.tokenizer,
                                                  device=device_index)
        self._freq_summarizer = frequency_summarizer
        self.device = self.model.device
Example #11
    def __init__(self, model):
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model)
        self.bert = QuestionAnsweringPipeline(model=self.model,
                                              tokenizer=self.tokenizer)
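
The class name is not shown in this snippet; assuming something like QABot, usage would look roughly like this (the class name, checkpoint, and call are assumptions, not part of the original):

bot = QABot("distilbert-base-cased-distilled-squad")  # hypothetical class name
print(bot.bert(question="Where was HuggingFace founded ?",
               context="HuggingFace was founded in Paris."))
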
Example #12
from transformers import QuestionAnsweringPipeline, BertForQuestionAnswering, BertTokenizerFast
from flask import Flask, jsonify, request
from flask_cors import cross_origin, CORS
#model = pickle.load(open('modelo-qa','rb'))
name_model = "francoMG/sara-qa"
tokenizer_model = "dccuchile/bert-base-spanish-wwm-uncased"
#name_model = 'ktrapeznikov/scibert_scivocab_uncased_squad_v2'
#name_model = 'mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es'
#tokenizer_model = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
#name_model = 'amoux/scibert_nli_squad'
model = BertForQuestionAnswering.from_pretrained(name_model)
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_model,
                                              do_lower_case=False)
nlp = QuestionAnsweringPipeline(model, tokenizer, framework="pt")
app = Flask(__name__)
CORS(app)


@app.route('/')
@cross_origin(origin='*')
def ServerStatus():
    return "Server Started"


@app.route('/preguntar', methods=['POST'])
@cross_origin(origin='*')
def preguntar():
    _pregunta = request.json['pregunta']
    score = -1
    resp = ""
    for dat in request.json['contexto']: