async def make(self):
    """Prepare the question-answering engine and store it on ``self.qa``.

    When ``self.index_directory`` does not exist on disk, a warning is logged
    and ``self.index(force=True)`` is awaited first. Afterwards a
    ``text.SimpleQA`` is created from the instance's index directory and
    BERT model settings.
    """
    index_present = os.path.exists(self.index_directory)
    if not index_present:
        logger.warning("index for BERTQuestionAnswering missing")
        await self.index(force=True)

    qa_settings = {
        "index_dir": self.index_directory,
        "bert_squad_model": self.bert_squad_model,
        "bert_emb_model": self.bert_emb_model,
    }
    self.qa = text.SimpleQA(**qa_settings)
def query_qa(params):
    """Answer a question against the search index configured for a domain.

    Args:
        params: Mapping that must contain ``'domain'`` (key into
            ``INDEX_DIR_MAP``) and ``'thresh'`` (converted with ``float``
            and passed to ``process_answers`` as ``threshold``). It is also
            handed unchanged to ``formulate_query``.

    Returns:
        ``[]`` when the domain maps to a falsy index directory, otherwise
        whatever ``process_answers`` returns for the answers found.

    Raises:
        KeyError: if ``params['domain']`` is not a key of ``INDEX_DIR_MAP``
            or a required key is missing from ``params``.
    """
    index_dir = INDEX_DIR_MAP[params['domain']]
    print("Index dir: ", index_dir)
    if not index_dir:
        return []

    qa = text.SimpleQA(index_dir)
    query = formulate_query(params)
    print("Query: ", query)
    answers = qa.ask(query, batch_size=4)
    # The answer list can be empty (e.g. nothing in the index matched the
    # query); indexing [0] unconditionally would make this debug print raise
    # IndexError and hide the real (empty) result from the caller.
    first_answer = answers[0] if answers else None
    print("Answers are: ", type(answers), first_answer)
    return process_answers(answers=answers, threshold=float(params['thresh']))
def init():
    """Open the QA engine over the dataset's search index.

    Returns:
        A ``text.SimpleQA`` bound to ``DATASET_NAME + "/index"``.
    """
    index_dir = DATASET_NAME + "/index"
    txt_dir = DATASET_NAME + "/txt"  # only referenced by the disabled rebuild below

    # Disabled index rebuild, kept for reference:
    # if path.exists(index_dir):
    #     shutil.rmtree(index_dir)
    # text.SimpleQA.initialize_index(index_dir)
    # file_count = sum((len(f) for _, _, f in os.walk(txt_dir)))
    # text.SimpleQA.index_from_folder(txt_dir, index_dir, commit_every=file_count)

    return text.SimpleQA(index_dir)
def test_qa(self):
    """End-to-end check: index 20 Newsgroups and answer a question with SimpleQA.

    Builds a fresh search index from the train and test splits (headers,
    footers and quotes removed), asks when Cassini launched and asserts the
    top answer text.
    """
    import os
    import shutil
    import tempfile

    from sklearn.datasets import fetch_20newsgroups

    remove = ('headers', 'footers', 'quotes')
    newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
    newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)
    docs = newsgroups_train.data + newsgroups_test.data

    # initialize_index() refuses to reuse an existing directory, so a fixed
    # path such as '/tmp/qa_test' makes every run after the first one fail.
    # Index into a not-yet-existing child of a fresh temporary directory.
    tmp_root = tempfile.mkdtemp()
    tmp_folder = os.path.join(tmp_root, 'qa_test')
    try:
        text.SimpleQA.initialize_index(tmp_folder)
        text.SimpleQA.index_from_list(docs, tmp_folder, commit_every=len(docs))
        qa = text.SimpleQA(tmp_folder)
        answers = qa.ask('When did Cassini launch?')
        top_answer = answers[0]['answer']
        self.assertEqual(top_answer, 'in october of 1997')
    finally:
        # Do not leave the index behind, whether the test passed or failed.
        shutil.rmtree(tmp_root, ignore_errors=True)
def test_qa(self):
    """End-to-end check of SimpleQA (TensorFlow framework) on 20 Newsgroups.

    Builds a multisegment index from the train and test splits (headers,
    footers and quotes removed) in a temporary directory, asks when Cassini
    launched and asserts the top answer text. The temporary index is removed
    afterwards.
    """
    import shutil
    import tempfile

    from sklearn.datasets import fetch_20newsgroups

    remove = ("headers", "footers", "quotes")
    newsgroups_train = fetch_20newsgroups(subset="train", remove=remove)
    newsgroups_test = fetch_20newsgroups(subset="test", remove=remove)
    docs = newsgroups_train.data + newsgroups_test.data

    # mkdtemp() only serves to obtain a unique path: the directory is removed
    # again so that initialize_index() can create it itself.
    tmp_folder = tempfile.mkdtemp()
    shutil.rmtree(tmp_folder)
    try:
        text.SimpleQA.initialize_index(tmp_folder)
        text.SimpleQA.index_from_list(
            docs, tmp_folder, commit_every=len(docs), multisegment=True
        )
        qa = text.SimpleQA(tmp_folder, framework="tf")
        answers = qa.ask("When did Cassini launch?")
        top_answer = answers[0]["answer"]
        self.assertEqual(top_answer, "in october of 1997")
    finally:
        # Previously the index was left on disk after every run.
        shutil.rmtree(tmp_folder, ignore_errors=True)
# Command-line helper: ask the dataset's QA index the question given as the
# first argument and print every candidate answer followed by its context.
import sys

from ktrain import text
from ktrain_config import DATASET_NAME

# Exit with a usage hint instead of an IndexError traceback when the
# question argument is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s QUESTION" % sys.argv[0])

question = sys.argv[1]
qa = text.SimpleQA(DATASET_NAME + "/index")
answers = qa.ask(question=question)
# qa.display_answers(answers[:5])
df = qa.answers2df(answers)
# print(df)

print("=" * 80)
print(question)
print("-" * 80)
# One section per answer row: the candidate answer, a rule, then its context.
for _, row in df.iterrows():
    print("=" * 80)
    print(row["Candidate Answer"])
    print("-" * 80)
    print(row["Context"])
def QAModel():
    """Return a ``text.SimpleQA`` over the ``index`` directory in the current working directory."""
    return text.SimpleQA(os.path.join(os.getcwd(), 'index'))
import os
import json
import ktrain
from ktrain import text
from operator import itemgetter
from flask import Flask, jsonify, request, Response
from settings import BIO_BERT_SQUAD_ML, BIO_MODEL, INDEXDIR

#BIO_BERT_SQUAD_ML = "ktrapeznikov/biobert_v1.1_pubmed_squad_v2"
#BIO_MODEL = 'mrm8488/GPT-2-finetuned-covid-bio-medrxiv'

#app
app = Flask(__name__)

# Module-level QA engine, constructed once when this module is imported,
# using the index directory and model names taken from ``settings``.
qa = text.SimpleQA(index_dir=INDEXDIR, \
                   bert_squad_model=BIO_BERT_SQUAD_ML, \
                   bert_emb_model= BIO_MODEL, \
                   from_pytorch=True)

#Util Functions
def create_json(_answers):
    """Convert QA answer items into plain dicts.

    For each item in ``_answers`` a new dict is built with the keys
    ``confidence``, ``answer`` (taken from the item's ``full_answer``),
    ``context``, ``similarity_score`` and ``reference``.

    NOTE(review): in the portion of the function visible here, ``new_item``
    is never appended to ``answers`` and nothing is returned -- the rest of
    the body appears to be cut off; confirm against the full file.
    """
    # create a json array for the answers list
    #sort the list by the confidence score
    #_response = answers.sort(key=itemgetter('confidence'))
    answers = []
    for item in _answers:
        new_item = {}
        # Scores are converted to built-in float/int -- presumably so they
        # can be JSON-serialized; TODO confirm.
        new_item['confidence'] = float(item['confidence'])
        new_item['answer'] = item['full_answer']
        new_item['context'] = item['context']
        new_item['similarity_score'] = float(item['similarity_score'])
        new_item['reference'] = int(item['reference'])
        ### Need to fetch relevant doc from MongoDB
# Builds a throw-away one-document index under /tmp and instantiates SimpleQA
# on it, which downloads the QA models as a side effect.
import os
import shutil
import ktrain
from ktrain import text

# NOTE(review): QA_MODEL is defined but not passed to SimpleQA below, so the
# library's default model is what gets used here -- confirm this is intended.
QA_MODEL = 'twmkn9/bert-base-uncased-squad2'
INDEX_DIR_PATH = os.path.join("/tmp", "index_dir")
docs = ['Hello world.']

# Start from a clean slate: drop any index left over from an earlier run.
if os.path.exists(INDEX_DIR_PATH):
    shutil.rmtree(INDEX_DIR_PATH)

# Create the index and add the documents in a single commit.
text.SimpleQA.initialize_index(INDEX_DIR_PATH)
text.SimpleQA.index_from_list(
    docs,
    INDEX_DIR_PATH,
    commit_every=len(docs),
)

# Instantiating SimpleQA triggers the model download.
qa = text.SimpleQA(INDEX_DIR_PATH)
# Demo script: index the 20 Newsgroups corpus with ktrain's SimpleQA and
# display the top five answers for a handful of sample questions.
import os
import shutil

from sklearn.datasets import fetch_20newsgroups

import ktrain
from ktrain import text

remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)
docs = newsgroups_train.data + newsgroups_test.data

INDEXDIR = '/tmp/myindex'

# initialize_index() raises when the target directory already exists, which
# made every run after the first one fail. Remove a stale index so the script
# is re-runnable and the index always reflects `docs`.
if os.path.exists(INDEXDIR):
    shutil.rmtree(INDEXDIR)
text.SimpleQA.initialize_index(INDEXDIR)
text.SimpleQA.index_from_list(docs, INDEXDIR, commit_every=len(docs))
qa = text.SimpleQA(INDEXDIR)

QUESTIONS = (
    'When did the Cassini probe launch?',
    'What causes computer images to be too dark?',
    'Who was Jesus Christ?',
    'Who is sachin tendulkkar?',
    'What is solar panel battery',
)

# Ask each question in turn and show its five best candidate answers.
for question in QUESTIONS:
    answers = qa.ask(question)
    qa.display_answers(answers[:5])
# Question answering over the 20 Newsgroups corpus, backed by a search index
# that is built on demand.
from sklearn.datasets import fetch_20newsgroups
from ktrain import text
import os.path as path

INDEX_DIRECTORY = '/tmp/newsgroups_index'


def index(index_directory):
    """Create a search index of the 20 Newsgroups corpus in ``index_directory``.

    Loads the train and test splits (headers, footers and quotes removed),
    initializes a new index at ``index_directory`` and adds all documents,
    committing every 100 documents.
    """
    remove = ("headers", "footers", "quotes")
    newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
    newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)
    docs = newsgroups_train.data + newsgroups_test.data
    text.SimpleQA.initialize_index(index_directory)
    text.SimpleQA.index_from_list(docs, index_directory, commit_every=100)


# The index must exist before SimpleQA is constructed. Previously `qa` was
# created first, so on a machine without the index the import failed and the
# lazy build in ask_index() could never run.
if not path.isdir(INDEX_DIRECTORY):
    index(INDEX_DIRECTORY)
qa = text.SimpleQA(INDEX_DIRECTORY)


def ask_index(question: str, max_answers: int):
    """Ask ``question`` and return the result of ``qa.ask``.

    Args:
        question: The natural-language question.
        max_answers: Passed to ``qa.ask`` as ``n_answers``.

    If the index directory has disappeared since import, it is rebuilt and
    the module-level ``qa`` is re-created on the new index before asking.
    """
    global qa
    if not path.isdir(INDEX_DIRECTORY):
        index(INDEX_DIRECTORY)
        # The old engine was opened on the index that no longer exists.
        qa = text.SimpleQA(INDEX_DIRECTORY)
    return qa.ask(question, n_answers=max_answers)