Example #1
# Imports assumed for this snippet; QA_PIPELINE is the module-level cache.
import os
from os import path

from cdqa.pipeline import QAPipeline

QA_PIPELINE = None


def init_QA_PIPELINE():
    """Main QA_PIPELINE initialization method. It should be called once when
    the system bootstraps.

    @requirements:
        BERT_MODEL_PATH environment variable with the model path
    """

    global QA_PIPELINE
    if QA_PIPELINE is None:
        model_file = os.environ.get('BERT_MODEL_PATH')

        if not model_file:
            dirname = path.dirname(__file__)
            temp_file = path.join(dirname, '../models/cdqa/bert_qa.joblib')
            if not path.exists(temp_file):
                raise RuntimeError(">> Can't load bert model, please set "
                                   "BERT_MODEL_PATH env var with model path")
            print('CPU Version Found')
            model_file = temp_file

        print('>>   Loading bert model..')
        qa_pipeline = QAPipeline(reader=model_file,
                                 max_df=1.0,
                                 retriever="bm25")
        print('>>   Bert Model Loaded')
        QA_PIPELINE = qa_pipeline
    return QA_PIPELINE
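A minimal usage sketch for the initializer above; the model path is an assumed example, not one from the source:

# Hedged usage sketch; the path below is an assumed example.
import os

os.environ['BERT_MODEL_PATH'] = '/opt/models/bert_qa.joblib'
pipeline = init_QA_PIPELINE()  # first call loads and caches the model
pipeline = init_QA_PIPELINE()  # later calls return the cached instance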
Example #2
# Imports assumed for this snippet:
import pandas as pd
import torch
from ast import literal_eval

from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_bnpp_data, download_model
from cdqa.utils.filters import filter_paragraphs
def execute_pipeline(query, n_predictions=None):
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(reader="models/bert_qa_vCPU-sklearn.joblib")
    cdqa_pipeline.fit_retriever(df)
    if torch.cuda.is_available():
        cdqa_pipeline.cuda()
    if n_predictions is not None:
        predictions = cdqa_pipeline.predict(query, n_predictions=n_predictions)
        result = []

        for answer, title, paragraph, score in predictions:
            prediction = (answer, title)
            result.append(prediction)
        return result
    else:
        prediction = cdqa_pipeline.predict(query)
        result = (prediction[0], prediction[1])
        return result
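A short usage sketch for execute_pipeline; the question string is an assumed example:

# Single best answer: an (answer, title) pair.
answer, title = execute_pipeline("Since when does the Excellence Program exist?")

# Top-3 answers: a list of (answer, title) pairs.
top3 = execute_pipeline("Since when does the Excellence Program exist?", n_predictions=3)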
Example #3
# Imports assumed for this snippet:
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model
from cdqa.utils.evaluation import evaluate_reader
def test_evaluate_reader():

    download_model("bert-squad_1.1", dir="./models")
    cdqa_pipeline = QAPipeline(reader="./models/bert_qa.joblib", n_jobs=-1)
    eval_dict = evaluate_reader(cdqa_pipeline, "./test_data.json")

    assert eval_dict["exact_match"] > 0.8
    assert eval_dict["f1"] > 0.8
Example #4
# Imports assumed for this snippet:
import json
from ast import literal_eval

import pandas as pd

from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_bnpp_data, download_model
from cdqa.utils.evaluation import evaluate_pipeline
from cdqa.utils.filters import filter_paragraphs
def test_evaluate_pipeline():

    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    test_data = {
        "data": [{
            "title": "BNP Paribas’ commitment to universities and schools",
            "paragraphs": [{
                "context": "Since January 2016, BNP Paribas has offered an Excellence Program targeting new Master’s level graduates (BAC+5) who show high potential. The aid program lasts 18 months and comprises three assignments of six months each. It serves as a strong career accelerator that enables participants to access high-level management positions at a faster rate. The program allows participants to discover the BNP Paribas Group and its various entities in France and abroad, build an internal and external network by working on different assignments and receive personalized assistance from a mentor and coaching firm at every step along the way.",
                "qas": [{
                    "answers": [
                        {"answer_start": 6, "text": "January 2016"},
                        {"answer_start": 6, "text": "January 2016"},
                        {"answer_start": 6, "text": "January 2016"},
                    ],
                    "question": "Since when does the Excellence Program of BNP Paribas exist?",
                    "id": "56be4db0acb8001400a502ec",
                }],
            }],
        }],
        "version": "1.1",
    }

    with open("./test_data.json", "w") as f:
        json.dump(test_data, f)

    cdqa_pipeline = QAPipeline(reader="./models/bert_qa_vCPU-sklearn.joblib",
                               n_jobs=-1)
    cdqa_pipeline.fit_retriever(X=df)

    eval_dict = evaluate_pipeline(cdqa_pipeline,
                                  "./test_data.json",
                                  output_dir=None)

    assert eval_dict["exact_match"] > 0.8

    assert eval_dict["f1"] > 0.8
def fine_tuning_drive(question, file_name):
    storage.child("docs/" + file_name).download("/docs/", "docs/" + file_name)
    df = pdf_converter(directory_path="docs/")
    pd.set_option('display.max_colwidth', -1)
    df.head()
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    cdqa_pipeline.fit_retriever(df=df)
    joblib.dump(cdqa_pipeline, './models/bert_qa_custom.joblib')
    cdqa_pipeline = joblib.load('./models/bert_qa_custom.joblib')
    prediction = cdqa_pipeline.predict(question, 1)
    os.remove("docs/" + file_name)
    return prediction
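A hypothetical call, assuming a configured pyrebase `storage` client and a PDF already uploaded under docs/; the file name and question are illustrative only:

# Assumed file name and question, for illustration.
prediction = fine_tuning_drive("What is the notice period?", "contract.pdf")
print(prediction[0])  # best answer span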
Example #6
# Imports assumed for this snippet:
from cdqa.pipeline import QAPipeline
from cdqa.utils.converters import pdf_converter
def qna(query):
    df = pdf_converter(directory_path='./media/pdf')
    df.head()
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    # Fit Retriever to documents
    cdqa_pipeline.fit_retriever(df=df)
    # INPUT QUESTION
    print("\n\n", query)
    # query = 'when was the second Indian Factory Act passed?'
    prediction = cdqa_pipeline.predict(query)
    # ans = 'query: {}\n \nanswer: {} \ntitle: {} \nparagraph: {}'.format(query,prediction[0],prediction[1],prediction[2])
    ans = [query, prediction[0], prediction[1], prediction[2]]
    return ans
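A usage sketch, assuming PDFs are present under ./media/pdf:

# qna returns [query, answer, title, paragraph].
query, answer, title, paragraph = qna('when was the second Indian Factory Act passed?')
print(answer)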
Example #7
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument('query', type=str, required=True)
        args = parser.parse_args()

        df = pdf_converter(directory_path='./data/pdf/')
        cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib',
                                   max_df=1.0)

        cdqa_pipeline.fit_retriever(df=df)

        prediction = cdqa_pipeline.predict(args.query)

        return {'data': prediction}, 200
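A hedged client-side sketch against this endpoint, assuming the resource is registered at /qa on a local development server (the route is not shown in the excerpt):

import requests

# 'query' is the one required form field parsed by the RequestParser above.
resp = requests.post('http://localhost:5000/qa', data={'query': 'What is BNP Paribas?'})
print(resp.json()['data'])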
Example #8
File: QABot.py Project: kvsista/max_bot
# Imports assumed for this snippet:
import pandas as pd
from ast import literal_eval

from cdqa.pipeline import QAPipeline
def max_qa_bot(query):
    # df = pdf_converter(directory_path='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/pdf_files')
    df = pd.read_csv(
        'C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/data/data.csv',
        converters={'paragraphs': literal_eval})
    # df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(
        reader='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/models/bert_qa_vCPU-sklearn.joblib'
    )
    cdqa_pipeline.fit_retriever(df=df)

    # recognizer = sr.Recognizer()
    # # recognizer.pause_threshold = 5.0
    # with sr.Microphone() as source:
    #     # print("[search edureka: search youtube]")
    #     print("Speak Now")
    #     audio = recognizer.listen(source)
    #     query = recognizer.recognize_google(audio).capitalize()
    #     print(query)

    # query = "What is td ameritrade"
    prediction = cdqa_pipeline.predict(query)

    # print('query: {}\n'.format(query))
    # print('answer: {}\n'.format(prediction[0]))
    # print('title: {}\n'.format(prediction[1]))
    # print('paragraph: {}\n'.format(prediction[2]))

    # # Initializing the Text-to-Speech engine
    # engine = pyttsx3.init()

    # david = "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0"
    # zira = "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_ZIRA_11.0"
    # engine.setProperty('rate', 150)
    # engine.setProperty('volume', 1.0)
    # engine.setProperty('voice', david)
    # engine.say(prediction[2])
    # engine.runAndWait()
    # engine.stop()

    # result = ('Question: {}\n'.format(query).capitalize()) + ('Answer: {}\n'.format(prediction[0]).capitalize()) + ('Subject: {}\n'.format(prediction[1]).capitalize()) + ('Paragraph: {}\n'.format(prediction[2]).capitalize())
    result = prediction[2].capitalize()
    return result
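A usage sketch; the question is taken from the commented-out code above:

# Returns the capitalized paragraph that contains the best answer.
print(max_qa_bot("What is td ameritrade"))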
Example #9
    def __init__(self, repo_dir, src_dir_suffix, model, top_n,
                 retriever_score_weight):
        print("Mining repository data")
        self.top_n = top_n
        self.retriever_score_weight = retriever_score_weight
        self.repository_miner = miner.Miner(repo_dir, src_dir_suffix)
        self.miner_data = self.repository_miner.mine()
        self.prediction_data = [[f[0], [m[0] for m in f[1]]]
                                for f in self.miner_data]
        self.result_transformer = SourceParagraphsTransformer(
            self.repository_miner.files)

        print("Fitting the pipeline")
        self.cdqa_pipeline = QAPipeline(
            reader=model,
            min_df=0.0,
            max_df=1.0,
            top_n=self.top_n,
            retriever_score_weight=retriever_score_weight)
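A hypothetical instantiation; the enclosing class name is not shown in the excerpt, so `RepoQA` is an assumed stand-in, and all argument values are illustrative:

# All values below are illustrative assumptions.
qa = RepoQA(repo_dir='./my-repo',
            src_dir_suffix='src',
            model='./models/bert_qa.joblib',
            top_n=5,
            retriever_score_weight=0.35)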
Example #10
from cdqa.utils.converters import pdf_converter
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model

# Download model
download_model(model='bert-squad_1.1', dir='./models')

# INPUT PDFs
# Here path is the folder of the PDFs to be used
df = pdf_converter(
    directory_path='C:/Users/Viswash/Desktop/Work/ChatBot/Research/ Papers/')
df.head()

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)

# INPUT QUESTION
query = 'when was the second Indian Factory Act passed?'

prediction = cdqa_pipeline.predict(query)

ans = 'query: {} \nanswer: {} \ntitle: {} \nparagraph: {}'.format(
    query, prediction[0], prediction[1], prediction[2])
print(ans)
# OUTPUT
# print('query: {}'.format(query))
# print('answer: {}'.format(prediction[0]))
Example #11
# Imports assumed for this snippet:
import pandas as pd
from flask import Flask, request
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model

# df = pdf_converter(directory_path='./data/pdf/')

gre_paragraphs = [
    "The Graduate Record Examinations (GRE) is a standardized test that is an admissions requirement for many graduate schools[7] in the United States and Canada[8]. The GRE is owned and administered by Educational Testing Service (ETS).[9] The test was established in 1936 by the Carnegie Foundation for the Advancement of Teaching.[10]",
    "According to ETS, the GRE aims to measure verbal reasoning, quantitative reasoning, analytical writing, and critical thinking skills that have been acquired over a long period of learning. The content of the GRE consists of certain specific algebra, geometry, arithmetic, and vocabulary sections. The GRE General Test is offered as a computer-based exam administered at Prometric testing centers. In the graduate school admissions process, the level of emphasis that is placed upon GRE scores varies widely between schools and departments within schools. The importance of a GRE score can range from being a mere admission formality to an important selection factor.",
    "The GRE was significantly overhauled in August 2011, resulting in an exam that is not adaptive on a question-by-question basis, but rather by section, so that the performance on the first verbal and math sections determines the difficulty of the second sections presented. Overall, the test retained the sections and many of the question types from its predecessor, but the scoring scale was changed to a 130 to 170 scale (from a 200 to 800 scale).[11]",
    "The cost to take the test is US$205,[5] although ETS will reduce the fee under certain circumstances.[6] It also provides financial aid to those GRE applicants who prove economic hardship.[12] ETS does not release scores that are older than five years, although graduate program policies on the acceptance of scores older than five years will vary."
]
data = [['123', gre_paragraphs]]
df = pd.DataFrame(data, columns=['title', 'paragraphs'])

df.head()
cdqa_pipeline = QAPipeline(reader='./models/bert_qa_vCPU-sklearn.joblib',
                           max_df=1.0)

app = Flask(__name__)


@app.route('/train')
def train():
    tr = request.args.get('tr').split(',')
    # print(tr)
    # Fit Retriever to documents

    # Send model to GPU
    # cdqa_pipeline.cuda()

    # Fit Retriever to documents
    # print(df)
Example #12
import os
from ast import literal_eval
import pandas as pd

from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline

df = pd.read_csv('esrc_pdfs.csv', converters={'paragraphs': literal_eval})

cdqa_pipeline = QAPipeline(
    reader='/resources/cdQA/bert_qa.joblib'
)  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
cdqa_pipeline.fit_retriever(df=df)  # should this be fit_reader???

cdqa_pipeline.dump_reader('/resources/cdQA/bert-reader.joblib')

# prediction = cdqa_pipeline.predict(query, n_predictions=5)  # `query` is undefined at module level; see make_prediction below


def make_prediction(query, n_predictions):

    prediction = cdqa_pipeline.predict(query, n_predictions=n_predictions)

    return prediction
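A usage sketch for the helper above; the question is an assumed example:

# Returns up to 5 predictions from the fitted pipeline.
predictions = make_prediction('What research does the ESRC fund?', 5)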
Example #13
CORS(app)

dataset_path = 'data/df_corona.csv'
reader_path = 'model/model.joblib'
project_id = os.getenv('DIALOGFLOW_PROJECT_ID')

df = pd.read_csv(dataset_path, usecols=['context', 'question'])
df = df.fillna(method='ffill')

df['paragraphs'] = df[df.columns[1:]].apply(
    lambda x: x.dropna().values.tolist(), axis=1)

df.rename(columns={"question": "title"}, inplace=True)
df.drop(columns='context', inplace=True)

cdqa_pipeline = QAPipeline(reader=reader_path)
cdqa_pipeline.fit_retriever(df=df)


def detect_intent_texts(project_id, session_id, text, language_code):
    session_client = dialogflow.SessionsClient()
    session = session_client.session_path(project_id, session_id)

    if text:
        text_input = dialogflow.types.TextInput(text=text,
                                                language_code=language_code)
        query_input = dialogflow.types.QueryInput(text=text_input)
        response = session_client.detect_intent(session=session,
                                                query_input=query_input)
        print("...................................................")
        print(response)
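A hypothetical call, assuming DIALOGFLOW_PROJECT_ID is set and Google application-default credentials are configured; session id and question text are illustrative:

# Illustrative values only.
detect_intent_texts(project_id, 'session-1', 'What are the symptoms?', 'en')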
Example #14
# Import assumed for this snippet:
import pandas as pd
from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.evaluation import f1_score, exact_match_score

# dataset
df = pd.read_csv('./data/data_augmentation.csv', converters={'paragraphs': literal_eval}, encoding='utf-8')
# Redefine paragraphs as Title + Paragraph
df['paragraphs_old'] = df['paragraphs']
df['paragraphs'] = df.apply(lambda row: [row['title']] + row['paragraphs_old'], axis=1).copy()

data = pd.read_csv('./data/data_augmentation.csv', encoding='utf-8')
data_sampling = data.sample(100, random_state=66)

from cdqa.retriever import TfidfRetriever, BM25Retriever

cdqa_pipeline = QAPipeline(reader='bert_qa_multi_epoch3.joblib', retrieve_by_doc=True, retriever='bm25')
cdqa_pipeline.fit_retriever(df=df)
cdqa_pipeline.cuda()
retriever = BM25Retriever(ngram_range=(1, 2), max_df=0.8, min_df=3, stop_words=None, lowercase=True, top_n=5)
retriever.fit(df=df)
def f1(dataframe, dataframe2):
    number = 0
    exact_number = 0
    # score = []
    answer_list = []
    while number < 100:
        # print("Question?")
        question = dataframe2.iloc[number, 2]  # the question text
        # question = input()
        best_idx_scores = retriever.predict(question)
        prediction = df.loc[best_idx_scores.keys()]['paragraphs'].apply(lambda x:x[1]).tolist()[0].replace(u'\xa0',u'')
Example #15
File: model.py Project: sebbersk/Surmize
    def __init__(self):
        # Fix in order to convert only one file at a time
        # https://github.com/cdqa-suite/cdQA/issues/224
        self.cdqa_pipeline = QAPipeline(reader=trained_weights,
                                        max_df=1,
                                        min_df=1)
Example #16
File: main.py Project: Cli212/VirtualHuman
    title = item['title']
    paragraphs = []

    for paragraph in item['paragraphs']:
        paragraphs.append(paragraph['context'])

    dictionary_df.append({'title': title, 'paragraphs': paragraphs})

df = pd.DataFrame(dictionary_df)

# Get original Bert_qa and then train on our annotated dataset
wget.download(
    url='https://github.com/cdqa-suite/cdQA/releases/download/bert_qa/bert_qa.joblib',
    out='./')
cdqa_pipeline = QAPipeline(reader='./bert_qa.joblib')
cdqa_pipeline.fit_retriever(df=df)
cdqa_pipeline.fit_reader('./sapiens_annotated.json')

# Use the pretrained annotated Distilbert file
#wget.download(url='https://github.com/Rathore25/Sapiens-QA/raw/main/Pretrained Data/sapiens_distilbert.joblib', out='./')
#cdqa_pipeline = QAPipeline(reader='./sapiens_distilbert.joblib')
#cdqa_pipeline.fit_retriever(df=df)

# Use the pretrained annotated Bert file
#wget.download(url='https://github.com/Rathore25/Sapiens-QA/raw/main/Pretrained Data/sapiens_bert.joblib', out='./')
#cdqa_pipeline = QAPipeline(reader='./sapiens_bert.joblib')
#cdqa_pipeline.fit_retriever(df=df)


@app.route("/api", methods=["GET"])
Example #17
# pandas, literal_eval and QAPipeline are used below but were not imported in
# the original excerpt; these imports are assumed.
import pandas as pd
from ast import literal_eval

from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_squad, download_model, download_bnpp_data
import speech_recognition as sr
import pyttsx3

# # Downloading data
# download_squad(dir='./data')
# download_bnpp_data(dir='./data/bnpp_newsroom-v1.1')

# # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
# download_model('bert-squad_1.1', dir='./models')

# df = pdf_converter(directory_path='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/pdf_files')
df = pd.read_csv('C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/financial_data/financial_data.csv', converters={'paragraphs': literal_eval})
# df = filter_paragraphs(df)

cdqa_pipeline = QAPipeline(reader='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/models/bert_qa_vCPU-sklearn.joblib')
cdqa_pipeline.fit_retriever(df=df)

# Initializing the Text-to-Speech engine
engine = pyttsx3.init()

david = "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0"
# zira = "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_ZIRA_11.0"
engine.setProperty('rate', 150)
engine.setProperty('volume', 0.9)
engine.setProperty('voice', david)

recognizer = sr.Recognizer()
# recognizer.pause_threshold = 5.0
with sr.Microphone() as source:
    # print("[search edureka: search youtube]")
Example #18
from flask import Flask, render_template, jsonify
from flask_restful import Api, Resource

from cdqa.pipeline import QAPipeline
import pandas as pd
import pickle
import re

app = Flask('Customer Warriors')

dataframe_from_pkl = pd.read_pickle('./csv_of_df_scm.pkl')

with open('urldict.pickle', 'rb') as handle:
    url_dict = pickle.load(handle)

model = QAPipeline(reader='./distilbert_qa_finetuned.joblib', max_df=1.0)
model.fit_retriever(df=dataframe_from_pkl)


def show_predictions(pred, url_dict):
    return pred[0], url_dict.get(pred[1]), pred[2]
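A usage sketch; the question is an assumed example:

# predict() returns (answer, title, paragraph); show_predictions maps the
# title to its URL via url_dict.
pred = model.predict('How do I track my order?')
answer, url, paragraph = show_predictions(pred, url_dict)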


@app.route('/')
@app.route('/index.html')
def home():
    return render_template('index.html')


@app.route('/SomeSampleQnAs.html')
def show_sample_qnas():
Example #19
from ast import literal_eval

from flask import Flask, render_template, request, redirect
import json
import numpy as np
import pandas as pd
import string, re
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_squad, download_model, download_bnpp_data

app = Flask(__name__)
file_path = r'.\data.csv'
urls = []
try:
    df = pd.read_csv(file_path, converters={'paragraphs': literal_eval})
    cdqa_pipeline = QAPipeline(reader='models/distilbert_qa.joblib')
    cdqa_pipeline.fit_retriever(df=df)

except IOError:
    print('Failed to load the dataset or reader model')

#def choose_model():  #done
#model = download_model(model='bert-squad_1.1', dir='./models')
#return model

regex = re.compile('[%s]' % re.escape(string.punctuation))


def remove_punctuation(txt):
    return regex.sub('', txt)
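A quick check of the helper above:

print(remove_punctuation("What's the GRE?"))  # -> Whats the GRE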
Example #20
def indexdq(request):
    if request.method == 'POST':
        if 'file' in request.FILES:
            request.session['proj_id'] = request.POST['proj_id']
            uploaded_file = request.FILES['file']
            request.session['name'] = uploaded_file.name.split(".")[0]
            fs = FileSystemStorage()
            if not os.path.exists("media/" + str(request.user.id)):
                os.makedirs("media/" + str(request.user.id))
            filename = fs.save(
                str(request.user.id) + "/pdfs/" + uploaded_file.name,
                uploaded_file)
            uploaded_file_url = fs.url(filename)
            print(uploaded_file_url)
            print(os.getcwd())
            print(os.listdir('media/2/pdfs/'))
            df = pdf_converter(directory_path='media/' + str(request.user.id) +
                               '/pdfs/')
            print(df)

            from cdqa.utils.download import download_squad, download_model, download_bnpp_data

            directory = '/home/tanmay/Downloads'

            # Downloading data
            download_squad(dir=directory)
            download_bnpp_data(dir=directory)

            # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
            download_model('bert-squad_1.1', dir=directory)

            # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
            download_model('distilbert-squad_1.1', dir=directory)

            cdqa_pipeline = QAPipeline(
                reader='/home/tanmay/Downloads/bert_qa.joblib'
            )  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
            cdqa_pipeline.fit_retriever(df=df)

            pkl_filename = '/home/tanmay/Downloads/' + request.session['name'] + 'query.pkl'
            with open(pkl_filename, 'wb') as file:
                pickle.dump(cdqa_pipeline, file)
            cdqa_pipeline = ""
            uploaded_file = ""
            df = ""
            gc.collect()
            # joblib.dump(cdqa_pipeline, '/home/tanmay/Downloads/'+request.session['name']+'query.joblib') #did not work
            # cdqa_pipeline.dump_reader('/home/tanmay/Downloads/'+request.session['name']+'query.joblib') #did not work
            request.session["model_url"] = '/home/tanmay/Downloads/' + request.session['name'] + 'query.pkl'
            rdata = {"result": "Model is trained"}
            return (JsonResponse(rdata))
        else:
            pkl_filename = request.session["model_url"]
            with open(pkl_filename, 'rb') as file:
                cdqa_pipeline = pickle.load(file)
            question = request.POST["question"]
            # cdqa_pipeline = QAPipeline(reader= request.session['model_url'])
            Ans = cdqa_pipeline.predict(question)
            cdqa_pipeline = ""
            gc.collect()
            print(Ans)
            rdata = {"one_word": Ans[0], "paragraph": Ans[2]}
            return (JsonResponse(rdata))
    else:
        return (render(request, "ml/docquery/index.html"))
Example #21
import dash
import dash_html_components as html
import dash_core_components as dcc
import pandas as pd
from ast import literal_eval
from cdqa.pipeline import QAPipeline


external_stylesheets = ['assets/design.css', 'spinner.css']
path_to_dataset = 'dataset.csv'
path_to_model = 'models/bert_qa.joblib'

df = pd.read_csv(path_to_dataset, converters={'paragraphs': literal_eval})
cdqa_pipeline = QAPipeline(reader=path_to_model)
cdqa_pipeline.fit_retriever(df=df)
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

tabs_style = {
    'borderBottom': '200px',
    'height': '60px'
}


app.title = 'cdqa-app'
app.layout = html.Div([
    html.Div(html.H1('Question Answering Visualization')),
    dcc.Tabs(id='tabs', children=[
        dcc.Tab(label='Choose a question from the dropdown', value='tab-1', children=[
            html.Div([html.H6('Choose an example from the list below')], style={'marginTop': 50}),
            dcc.Dropdown(
                id='query-dropdown',