Example #1
def test_evaluate_pipeline():

    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

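    # Minimal SQuAD v1.1-style payload: one document, one paragraph, one
    # question with three (identical) reference answers.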
    test_data = {
        "version": "1.1",
        "data": [{
            "title": "BNP Paribas’ commitment to universities and schools",
            "paragraphs": [{
                "context": "Since January 2016, BNP Paribas has offered an Excellence Program targeting new Master’s level graduates (BAC+5) who show high potential. The aid program lasts 18 months and comprises three assignments of six months each. It serves as a strong career accelerator that enables participants to access high-level management positions at a faster rate. The program allows participants to discover the BNP Paribas Group and its various entities in France and abroad, build an internal and external network by working on different assignments and receive personalized assistance from a mentor and coaching firm at every step along the way.",
                "qas": [{
                    "question": "Since when does the Excellence Program of BNP Paribas exist?",
                    "id": "56be4db0acb8001400a502ec",
                    "answers": [
                        {"answer_start": 6, "text": "January 2016"},
                        {"answer_start": 6, "text": "January 2016"},
                        {"answer_start": 6, "text": "January 2016"},
                    ],
                }],
            }],
        }],
    }

    with open("./test_data.json", "w") as f:
        json.dump(test_data, f)

    cdqa_pipeline = QAPipeline(reader="./models/bert_qa_vCPU-sklearn.joblib",
                               n_jobs=-1)
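    # Index the filtered corpus so the retriever can select candidate paragraphs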
    cdqa_pipeline.fit_retriever(X=df)

    eval_dict = evaluate_pipeline(cdqa_pipeline,
                                  "./test_data.json",
                                  output_dir=None)

    assert eval_dict["exact_match"] > 0.8
    assert eval_dict["f1"] > 0.8
Example #2
def fine_tuning_drive(question, file_name):
  storage.child("docs/" + file_name).download("/docs/", "docs/" + file_name)
  df = pdf_converter(directory_path="docs/")
  pd.set_option('display.max_colwidth', None)  # -1 is deprecated in recent pandas
  df.head()
  cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
  cdqa_pipeline.fit_retriever(df=df)
  joblib.dump(cdqa_pipeline, './models/bert_qa_custom.joblib')
  cdqa_pipeline = joblib.load('./models/bert_qa_custom.joblib')
  prediction = cdqa_pipeline.predict(question, 1)
  os.remove("docs/"+file_name)
  return prediction
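A minimal invocation sketch for the helper above (the question and file name are hypothetical, and `storage` must already be a configured pyrebase storage client, as the helper assumes):

prediction = fine_tuning_drive("Who signed the agreement?", "contract.pdf")
print(prediction)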
Example #3
def qna(query):
    df = pdf_converter(directory_path='./media/pdf')
    df.head()
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    # Fit Retriever to documents
    cdqa_pipeline.fit_retriever(df=df)
    # INPUT QUESTION
    print("\n\n\\n", query)
    #query = 'when was the second Indian Factory Act passed?'
    prediction = cdqa_pipeline.predict(query)
    # ans = 'query: {}\n \nanswer: {} \ntitle: {} \nparagraph: {}'.format(query,prediction[0],prediction[1],prediction[2])
    ans = [query, prediction[0], prediction[1], prediction[2]]
    return ans
Example #4
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument('query', type=str, required=True)
        args = parser.parse_args()

        df = pdf_converter(directory_path='./data/pdf/')
        cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib',
                                   max_df=1.0)

        cdqa_pipeline.fit_retriever(df=df)

        prediction = cdqa_pipeline.predict(args.query)

        return {'data': prediction}, 200
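For reference, a hedged client-side call for this endpoint (the host, port, and mount path are assumptions):

import requests

# Hypothetical URL; adjust to wherever this resource is mounted.
resp = requests.post("http://localhost:5000/api", data={"query": "What is BNP Paribas?"})
print(resp.json()["data"])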
Example #5
def execute_pipeline(query, n_predictions=None):
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(reader="models/bert_qa_vCPU-sklearn.joblib")
    cdqa_pipeline.fit_retriever(df)
    if torch.cuda.is_available():
        cdqa_pipeline.cuda()
    if n_predictions is not None:
        predictions = cdqa_pipeline.predict(query, n_predictions=n_predictions)
        result = []

        for answer, title, paragraph, score in predictions:
            prediction = (answer, title)
            result.append(prediction)
        return result
    else:
        prediction = cdqa_pipeline.predict(query)
        result = (prediction[0], prediction[1])
        return result
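A hedged example call (the query text is a placeholder):

# Returns (answer, title) for a single prediction, or a list of such pairs
# when n_predictions is given.
result = execute_pipeline("Since when does the Excellence Program exist?", n_predictions=2)
print(result)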
Example #6
def init_QA_PIPELINE():
    """Main QA_PIPELINE initialization method. It should be called once at
    bootstrap of the system.

    @requirements:
        BERT_MODEL_PATH environment variable with the model path
    """

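    # Lazily create the module-level QA_PIPELINE singleton on the first call.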
    global QA_PIPELINE
    if QA_PIPELINE is None:
        model_file = os.environ.get('BERT_MODEL_PATH', False)

        if not model_file:
            dirname = path.dirname(__file__)
            temp_file = path.join(dirname, '../models/cdqa/bert_qa.joblib')
            if not path.exists(temp_file):
                raise RuntimeError(
                    ">> Can't load bert model, please set BERT_MODEL_PATH env var with model path"
                )
            print('CPU Version Found')
            model_file = temp_file

        print('>>   Loading bert model..')
        qa_pipeline = QAPipeline(reader=model_file,
                                 max_df=1.0,
                                 retriever="bm25")
        print('>>   Bert Model Loaded')
        QA_PIPELINE = qa_pipeline
    return QA_PIPELINE
Example #7
def test_evaluate_reader():

    download_model("bert-squad_1.1", dir="./models")
    cdqa_pipeline = QAPipeline(reader="./models/bert_qa.joblib", n_jobs=-1)
    eval_dict = evaluate_reader(cdqa_pipeline, "./test_data.json")

    assert eval_dict["exact_match"] > 0.8
    assert eval_dict["f1"] > 0.8
Example #8
def max_qa_bot(query):
    # df = pdf_converter(directory_path='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/pdf_files')
    df = pd.read_csv(
        'C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/data/data.csv',
        converters={'paragraphs': literal_eval})
    # df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(
        reader='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/models/bert_qa_vCPU-sklearn.joblib')
    cdqa_pipeline.fit_retriever(df=df)

    # recognizer = sr.Recognizer()
    # # recognizer.pause_threshold = 5.0
    # with sr.Microphone() as source:
    #     # print("[search edureka: search youtube]")
    #     print("Speak Now")
    #     audio = recognizer.listen(source)
    #     query = recognizer.recognize_google(audio).capitalize()
    #     print(query)

    # query = "What is td ameritrade"
    prediction = cdqa_pipeline.predict(query)

    # print('query: {}\n'.format(query))
    # print('answer: {}\n'.format(prediction[0]))
    # print('title: {}\n'.format(prediction[1]))
    # print('paragraph: {}\n'.format(prediction[2]))

    # # Initializing the Text-to-Speech engine
    # engine = pyttsx3.init()

    # david = "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0"
    # zira = "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_ZIRA_11.0"
    # engine.setProperty('rate', 150)
    # engine.setProperty('volume', 1.0)
    # engine.setProperty('voice', david)
    # engine.say(prediction[2])
    # engine.runAndWait()
    # engine.stop()

    # result = ('Question: {}\n'.format(query).capitalize()) + ('Answer: {}\n'.format(prediction[0]).capitalize()) + ('Subject: {}\n'.format(prediction[1]).capitalize()) + ('Paragraph: {}\n'.format(prediction[2]).capitalize())
    result = prediction[2].capitalize()
    return result
Example #9
    def __init__(self, repo_dir, src_dir_suffix, model, top_n,
                 retriever_score_weight):
        print("Mining repository data")
        self.top_n = top_n
        self.retriever_score_weight = retriever_score_weight
        self.repository_miner = miner.Miner(repo_dir, src_dir_suffix)
        self.miner_data = self.repository_miner.mine()
        self.prediction_data = [[f[0], [m[0] for m in f[1]]]
                                for f in self.miner_data]
        self.result_transformer = SourceParagraphsTransformer(
            self.repository_miner.files)

        print("Fitting the pipeline")
        self.cdqa_pipeline = QAPipeline(
            reader=model,
            min_df=0.0,
            max_df=1.0,
            top_n=self.top_n,
            retriever_score_weight=retriever_score_weight)
Example #10
def download_pdf():  # assumed header; the top of this helper is truncated in the source
    # `directory` and `models_url` are presumably defined in the truncated part
    if not os.path.exists(directory):
        os.makedirs(directory)
    for url in models_url:
        wget.download(url=url, out=directory)


# download_pdf()

# Convert the pdf files into a dataframe
df = pdf_converter(directory_path='./data/pdf/')
print(df.head())
print("pdf files converted")

# Instantiate cdQA pipeline from model
cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
# cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)


# pre-req setup end

# Evaluating Models
from cdqa.utils.converters import df2squad

# 1. convert pandas df into json file with SQuAD format
json_data = df2squad(df=df, squad_version='v1.1', output_dir='.', filename='dataset-name')

# 2. use annotator to add ground truth
Example #11
CORS(app)

dataset_path = 'data/df_corona.csv'
reader_path = 'model/model.joblib'
project_id = os.getenv('DIALOGFLOW_PROJECT_ID')

df = pd.read_csv(dataset_path, usecols=['context', 'question'])
df = df.fillna(method='ffill')

df['paragraphs'] = df[df.columns[1:]].apply(
    lambda x: x.dropna().values.tolist(), axis=1)

df.rename(columns={"question": "title"}, inplace=True)
df.drop(columns='context', inplace=True)

cdqa_pipeline = QAPipeline(reader=reader_path)
cdqa_pipeline.fit_retriever(df=df)


def detect_intent_texts(project_id, session_id, text, language_code):
    session_client = dialogflow.SessionsClient()
    session = session_client.session_path(project_id, session_id)

    if text:
        text_input = dialogflow.types.TextInput(text=text,
                                                language_code=language_code)
        query_input = dialogflow.types.QueryInput(text=text_input)
        response = session_client.detect_intent(session=session,
                                                query_input=query_input)
        print("...................................................")
        print(response)
Example #12
 def __init__(self):
     # Fix in order to convert only one file at a time
     # https://github.com/cdqa-suite/cdQA/issues/224
     self.cdqa_pipeline = QAPipeline(reader=trained_weights,
                                     max_df=1,
                                     min_df=1)
Example #13
import os
from ast import literal_eval
import pandas as pd

from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline

df = pd.read_csv('esrc_pdfs.csv', converters={'paragraphs': literal_eval})

cdqa_pipeline = QAPipeline(
    reader='/resources/cdQA/bert_qa.joblib'
)  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
cdqa_pipeline.fit_retriever(df=df)  # fit_retriever indexes the corpus; the reader itself was loaded above

cdqa_pipeline.dump_reader('/resources/cdQA/bert-reader.joblib')

# Example top-level call; note that `query` must be defined before this line runs.
# prediction = cdqa_pipeline.predict(query, n_predictions=5)


def make_prediction(query, n_predictions):

    prediction = cdqa_pipeline.predict(query, n_predictions=n_predictions)

    return prediction
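A hedged example call (the question is a placeholder); make_prediction reuses the module-level pipeline fitted above:

answers = make_prediction("What does the ESRC fund?", n_predictions=3)
print(answers)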
Example #14
dictionary_df = []
for item in data['data']:  # assumed loop header (SQuAD layout); the snippet starts mid-loop
    title = item['title']
    paragraphs = []

    for paragraph in item['paragraphs']:
        paragraphs.append(paragraph['context'])

    dictionary_df.append({'title': title, 'paragraphs': paragraphs})

df = pd.DataFrame(dictionary_df)

# Get original Bert_qa and then train on our annotated dataset
wget.download(
    url='https://github.com/cdqa-suite/cdQA/releases/download/bert_qa/bert_qa.joblib',
    out='./')
cdqa_pipeline = QAPipeline(reader='./bert_qa.joblib')
cdqa_pipeline.fit_retriever(df=df)
cdqa_pipeline.fit_reader('./sapiens_annotated.json')
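# fit_reader fine-tunes the reader on the annotated SQuAD-format file above.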

# Use the pretrained annotated Distilbert file
#wget.download(url='https://github.com/Rathore25/Sapiens-QA/raw/main/Pretrained Data/sapiens_distilbert.joblib', out='./')
#cdqa_pipeline = QAPipeline(reader='./sapiens_distilbert.joblib')
#cdqa_pipeline.fit_retriever(df=df)

# Use the pretrained annotated Bert file
#wget.download(url='https://github.com/Rathore25/Sapiens-QA/raw/main/Pretrained Data/sapiens_bert.joblib', out='./')
#cdqa_pipeline = QAPipeline(reader='./sapiens_bert.joblib')
#cdqa_pipeline.fit_retriever(df=df)


@app.route("/api", methods=["GET"])
Example #15
import pandas as pd
from ast import literal_eval

from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_squad, download_model, download_bnpp_data
import speech_recognition as sr
import pyttsx3

# # Downloading data
# download_squad(dir='./data')
# download_bnpp_data(dir='./data/bnpp_newsroom-v1.1')

# # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
# download_model('bert-squad_1.1', dir='./models')

# df = pdf_converter(directory_path='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/pdf_files')
df = pd.read_csv('C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/financial_data/financial_data.csv', converters={'paragraphs': literal_eval})
# df = filter_paragraphs(df)

cdqa_pipeline = QAPipeline(reader='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/models/bert_qa_vCPU-sklearn.joblib')
cdqa_pipeline.fit_retriever(df=df)

# Initializing the Text-to-Speech engine
engine = pyttsx3.init()

david = r"HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0"
# zira = r"HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_ZIRA_11.0"
engine.setProperty('rate', 150)
engine.setProperty('volume', 0.9)
engine.setProperty('voice', david)

recognizer = sr.Recognizer()
# recognizer.pause_threshold = 5.0
with sr.Microphone() as source:
    # print("[search edureka: search youtube]")
Example #16
class QA:
    def __init__(self):
        # Fix in order to convert only one file at a time
        # https://github.com/cdqa-suite/cdQA/issues/224
        self.cdqa_pipeline = QAPipeline(reader=trained_weights,
                                        max_df=1,
                                        min_df=1)

    def predict(self, question):
        """
        Question-answering method.
        Parameter: a text string containing a question.

        Returns:
        A tuple (answer, context, score): the direct answer to the question,
        the sentence/context where the answer was found, and the model score.
        """
        answer, title, context, score = self.cdqa_pipeline.predict(question)
        return answer, context, score

    def convert_data(self, filepath):
        """
        Convert a data file (.csv, .txt/.story, or .pdf) into a dataframe,
        keep a plain-text copy, and fit the retriever on it.
        """
        filename = os.path.basename(filepath)
        name, extension = os.path.splitext(str(filename))
        root, _ = filepath.split(f"/text/{filename}")
        filepath_txt = f"{root}/text/{name}.txt"
        filepath_csv = f"{root}/csv/{name}.csv"

        if extension == ".csv":
            # csv needs to have "title" and "paragraphs" features
            df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
            df = filter_paragraphs(df)
            # https://stackoverflow.com/questions/51491931/reading-text-files-from-subfolders-and-folders-and-creating-a-dataframe-in-panda

        elif extension == ".txt" or extension == ".story":
            lines = []
            # Read file and remove non UTF-8 chars
            with open(filepath, encoding="utf8", errors='ignore') as f:
                for line in f:
                    lines.append(
                        bytes(line, "utf-8").decode("utf-8", "ignore"))
                paragraphs = lines

            # Make df to use in QA
            df = pd.DataFrame({"title": filename, "paragraphs": [paragraphs]})
            with open(filepath_txt, "w+") as f:
                for line in lines:
                    f.write(line)

        elif extension == ".pdf":
            tmp_dir = f"{root}/tmp"
            tmp_filepath = f"{tmp_dir}/{filename}"

            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)
            shutil.copyfile(filepath, tmp_filepath)

            df = pdf_converter(directory_path=tmp_dir)
            shutil.rmtree(tmp_dir, ignore_errors=True)
            os.remove(filepath)  # Remove original pdf file

            with open(filepath_txt, "w") as file:
                for line in df.loc[0]["paragraphs"]:
                    file.write("\n" + line)

        #df.to_csv(f"{filepath_csv}", index=False)
        self.cdqa_pipeline.fit_retriever(df=df)

    def convert_and_load(self, filepath=None, filename=None):
        self.convert_data(filepath)
        #self.load_data(filepath)

    def load_data(self, filepath=None):
        """
        Read in a data file/path and determine the file type.
        If no file type is given, the folder is assumed to contain PDFs.
        """
        df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
        df = filter_paragraphs(df)
        self.cdqa_pipeline.fit_retriever(df=df)
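A hedged usage sketch for this class (paths and the question are placeholders; `trained_weights` must point at a cdQA reader joblib, as __init__ assumes):

qa = QA()
qa.convert_and_load("corpus/text/report.txt")
answer, context, score = qa.predict("What is the report about?")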
Example #17
import pandas as pd
from flask import Flask, request

from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model

# df = pdf_converter(directory_path='./data/pdf/')

gre_paragraphs = [
    "The Graduate Record Examinations (GRE) is a standardized test that is an admissions requirement for many graduate schools[7] in the United States and Canada[8]. The GRE is owned and administered by Educational Testing Service (ETS).[9] The test was established in 1936 by the Carnegie Foundation for the Advancement of Teaching.[10]",
    "According to ETS, the GRE aims to measure verbal reasoning, quantitative reasoning, analytical writing, and critical thinking skills that have been acquired over a long period of learning. The content of the GRE consists of certain specific algebra, geometry, arithmetic, and vocabulary sections. The GRE General Test is offered as a computer-based exam administered at Prometric testing centers. In the graduate school admissions process, the level of emphasis that is placed upon GRE scores varies widely between schools and departments within schools. The importance of a GRE score can range from being a mere admission formality to an important selection factor.",
    "The GRE was significantly overhauled in August 2011, resulting in an exam that is not adaptive on a question-by-question basis, but rather by section, so that the performance on the first verbal and math sections determines the difficulty of the second sections presented. Overall, the test retained the sections and many of the question types from its predecessor, but the scoring scale was changed to a 130 to 170 scale (from a 200 to 800 scale).[11]",
    "The cost to take the test is US$205,[5] although ETS will reduce the fee under certain circumstances.[6] It also provides financial aid to those GRE applicants who prove economic hardship.[12] ETS does not release scores that are older than five years, although graduate program policies on the acceptance of scores older than five years will vary."
]
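# Wrap the paragraphs in a one-row dataframe with the 'title'/'paragraphs' columns cdQA expects.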
data = [['123', gre_paragraphs]]
df = pd.DataFrame(data, columns=['title', 'paragraphs'])

df.head()
cdqa_pipeline = QAPipeline(reader='./models/bert_qa_vCPU-sklearn.joblib',
                           max_df=1.0)

app = Flask(__name__)


@app.route('/train')
def train():
    tr = request.args.get('tr').split(',')
    # print(tr)
    # Fit Retriever to documents

    # Send model to GPU
    # cdqa_pipeline.cuda()

    # Fit Retriever to documents
    # print(df)
Example #18
from ast import literal_eval

from flask import Flask, render_template, request, redirect
import json
import numpy as np
import pandas as pd
import string, re
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_squad, download_model, download_bnpp_data

app = Flask(__name__)
file_path = r'.\data.csv'
urls = []
try:
    df = pd.read_csv(file_path, converters={'paragraphs': literal_eval})
    cdqa_pipeline = QAPipeline(reader='models/distilbert_qa.joblib')
    cdqa_pipeline.fit_retriever(df=df)

except IOError:
    print('error')

#def choose_model():  #done
#model = download_model(model='bert-squad_1.1', dir='./models')
#return model

regex = re.compile('[%s]' % re.escape(string.punctuation))


def remove_punctuation(txt):
    return regex.sub('', txt)
Example #19
from flask import Flask, render_template
from flask_restful import Api, Resource

from cdqa.pipeline import QAPipeline
import pandas as pd
import pickle
import re

app = Flask('Customer Warriors')

dataframe_from_pkl = pd.read_pickle('./csv_of_df_scm.pkl')

with open('urldict.pickle', 'rb') as handle:
    url_dict = pickle.load(handle)

model = QAPipeline(reader='./distilbert_qa_finetuned.joblib', max_df=1.0)
model.fit_retriever(df=dataframe_from_pkl)


def show_predictions(pred, url_dict):
    return pred[0], url_dict.get(pred[1]), pred[2]


@app.route('/')
@app.route('/index.html')
def home():
    return render_template('index.html')


@app.route('/SomeSampleQnAs.html')
def show_sample_qnas():
Example #20
import pandas as pd
from ast import literal_eval

from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model, download_bnpp_data

download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
download_model(model='bert-squad_1.1', dir='./models')

df = pd.read_csv('final.csv', converters={'paragraphs': literal_eval})
print(df.head())
df2 = filter_paragraphs(df)
print(df2.head())

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib')
cdqa_pipeline.fit_retriever(df=df2)

queries = [
    'What is known about transmission, incubation, and environmental stability?',
    'What do we know about COVID-19 risk factors?',
    'What do we know about virus genetics, origin, and evolution?',
    'What do we know about vaccines and therapeutics?',
    'What do we know about non-pharmaceutical interventions?',
    'What has been published about medical care?',
    'What do we know about diagnostics and surveillance?',
    'What has been published about information sharing and inter-sectoral collaboration?',
    'What has been published about ethical and social science considerations?'
]
for query in queries:
    prediction = cdqa_pipeline.predict(query)
Example #21
from cdqa.utils.converters import pdf_converter
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model

# Download model
download_model(model='bert-squad_1.1', dir='./models')

# INPUT PDFs
# Here path is the folder of the PDFs to be used
df = pdf_converter(
    directory_path='C:/Users/Viswash/Desktop/Work/ChatBot/Research/Papers/')
df.head()

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)

# INPUT QUESTION
query = 'when was the second Indian Factory Act passed?'

prediction = cdqa_pipeline.predict(query)

ans = 'query: {} \nanswer: {} \ntitle: {} \nparagraph: {}'.format(
    query, prediction[0], prediction[1], prediction[2])
print(ans)
# OUTPUT
# print('query: {}'.format(query))
# print('answer: {}'.format(prediction[0]))
Example #22
import pandas as pd
from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.evaluation import f1_score, exact_match_score

# dataset
df = pd.read_csv('./data/data_augmentation.csv', converters={'paragraphs': literal_eval}, encoding='utf-8')
# Redefine paragraphs: prepend the title to each paragraph list
df['paragraphs_old'] = df['paragraphs']
df['paragraphs'] = df.apply(lambda row: [row['title']] + row['paragraphs_old'], axis=1).copy()

data = pd.read_csv('./data/data_augmentation.csv', encoding='utf-8')
data_sampling = data.sample(100, random_state=66)

from cdqa.retriever import TfidfRetriever, BM25Retriever
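# retrieve_by_doc=True ranks whole documents; BM25 replaces the default TF-IDF retriever.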
cdqa_pipeline = QAPipeline(reader='bert_qa_multi_epoch3.joblib', retrieve_by_doc=True, retriever='bm25')
cdqa_pipeline.fit_retriever(df=df)
cdqa_pipeline.cuda()
retriever = BM25Retriever(ngram_range=(1, 2), max_df=0.8, min_df=3, stop_words=None, lowercase=True, top_n=5)
retriever.fit(df=df)
def f1(dataframe, dataframe2):
    number = 0
    exact_number = 0
    # score = []
    answer_list = []
    while number < 100:
        # print("Question?")
        question = dataframe2.iloc[number, 2]  # the question
        # question = input()
        best_idx_scores = retriever.predict(question)
        prediction = df.loc[best_idx_scores.keys()]['paragraphs'].apply(lambda x: x[1]).tolist()[0].replace(u'\xa0', u'')
Example #23
def indexdq(request):
    if request.POST:
        if 'file' in request.FILES:
            request.session['proj_id'] = request.POST['proj_id']
            uploaded_file = request.FILES['file']
            request.session['name'] = uploaded_file.name.split(".")[0]
            fs = FileSystemStorage()
            if not os.path.exists("media/" + str(request.user.id)):
                os.makedirs("media/" + str(request.user.id))
            filename = fs.save(
                str(request.user.id) + "/pdfs/" + uploaded_file.name,
                uploaded_file)
            uploaded_file_url = fs.url(filename)
            print(uploaded_file_url)
            print(os.getcwd())
            print(os.listdir('media/2/pdfs/'))
            df = pdf_converter(directory_path='media/' + str(request.user.id) +
                               '/pdfs/')
            print(df)

            from cdqa.utils.download import download_squad, download_model, download_bnpp_data

            directory = '/home/tanmay/Downloads'

            # Downloading data
            download_squad(dir=directory)
            download_bnpp_data(dir=directory)

            # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
            download_model('bert-squad_1.1', dir=directory)

            # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
            download_model('distilbert-squad_1.1', dir=directory)

            cdqa_pipeline = QAPipeline(
                reader='/home/tanmay/Downloads/bert_qa.joblib'
            )  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
            cdqa_pipeline.fit_retriever(df=df)

            pkl_filename = '/home/tanmay/Downloads/' + request.session['name'] + 'query.pkl'
            with open(pkl_filename, 'wb') as file:
                pickle.dump(cdqa_pipeline, file)
            cdqa_pipeline = ""
            uploaded_file = ""
            df = ""
            gc.collect()
            # joblib.dump(cdqa_pipeline, '/home/tanmay/Downloads/'+request.session['name']+'query.joblib') #did not work
            # cdqa_pipeline.dump_reader('/home/tanmay/Downloads/'+request.session['name']+'query.joblib') #did not work
            request.session["model_url"] = '/home/tanmay/Downloads/' + request.session['name'] + 'query.pkl'
            rdata = {"result": "Model is trained"}
            return (JsonResponse(rdata))
        else:
            pkl_filename = request.session["model_url"]
            with open(pkl_filename, 'rb') as file:
                cdqa_pipeline = pickle.load(file)
            question = request.POST["question"]
            # cdqa_pipeline = QAPipeline(reader= request.session['model_url'])
            Ans = cdqa_pipeline.predict(question)
            cdqa_pipeline = ""
            gc.collect()
            print(Ans)
            rdata = {"one_word": Ans[0], "paragraph": Ans[2]}
            return (JsonResponse(rdata))
    else:
        return (render(request, "ml/docquery/index.html"))
Example #24
import dash
import dash_html_components as html
import dash_core_components as dcc
import pandas as pd
from ast import literal_eval
from cdqa.pipeline import QAPipeline


external_stylesheets = ['assets/design.css', 'spinner.css']
path_to_dataset = 'dataset.csv'
path_to_model = 'models/bert_qa.joblib'

df = pd.read_csv(path_to_dataset, converters={'paragraphs': literal_eval})
cdqa_pipeline = QAPipeline(reader=path_to_model)
cdqa_pipeline.fit_retriever(df=df)
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

tabs_style = {
    'borderBottom': '200px',
    'height': '60px'
}


app.title = 'cdqa-app'
app.layout = html.Div([
    html.Div(html.H1('Question Answering Visualization')),
    dcc.Tabs(id='tabs', children=[
        dcc.Tab(label='Choose a question from the dropdown', value='tab-1',children = [
            html.Div([html.H6('Choose an example from the list below')],style={'marginTop': 50}),
            dcc.Dropdown(
                id='query-dropdown',