Code example #1
import pandas as pd
import torch
from ast import literal_eval

from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_bnpp_data, download_model
from cdqa.utils.filters import filter_paragraphs


def execute_pipeline(query, n_predictions=None):
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(reader="models/bert_qa_vCPU-sklearn.joblib")
    cdqa_pipeline.fit_retriever(df)
    if torch.cuda.is_available():
        cdqa_pipeline.cuda()
    if n_predictions is not None:
        predictions = cdqa_pipeline.predict(query, n_predictions=n_predictions)
        result = []

        for answer, title, paragraph, score in predictions:
            prediction = (answer, title)
            result.append(prediction)
        return result
    else:
        prediction = cdqa_pipeline.predict(query)
        result = (prediction[0], prediction[1])
        return result
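A minimal usage sketch for the function above (the query string is an example, not from the source):

# Single best (answer, title) pair
answer, title = execute_pipeline("Since when does the Excellence Program of BNP Paribas exist?")

# Top-3 (answer, title) pairs
results = execute_pipeline("Since when does the Excellence Program of BNP Paribas exist?", n_predictions=3)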
Code example #2
import json
from ast import literal_eval

import pandas as pd

from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_bnpp_data, download_model
from cdqa.utils.evaluation import evaluate_pipeline
from cdqa.utils.filters import filter_paragraphs


def test_evaluate_pipeline():
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    test_data = {
        "data": [{
            "title": "BNP Paribas’ commitment to universities and schools",
            "paragraphs": [{
                "context": "Since January 2016, BNP Paribas has offered an Excellence Program targeting new Master’s level graduates (BAC+5) who show high potential. The aid program lasts 18 months and comprises three assignments of six months each. It serves as a strong career accelerator that enables participants to access high-level management positions at a faster rate. The program allows participants to discover the BNP Paribas Group and its various entities in France and abroad, build an internal and external network by working on different assignments and receive personalized assistance from a mentor and coaching firm at every step along the way.",
                "qas": [{
                    "answers": [
                        {"answer_start": 6, "text": "January 2016"},
                        {"answer_start": 6, "text": "January 2016"},
                        {"answer_start": 6, "text": "January 2016"},
                    ],
                    "question": "Since when does the Excellence Program of BNP Paribas exist?",
                    "id": "56be4db0acb8001400a502ec",
                }],
            }],
        }],
        "version": "1.1",
    }

    with open("./test_data.json", "w") as f:
        json.dump(test_data, f)

    cdqa_pipeline = QAPipeline(reader="./models/bert_qa_vCPU-sklearn.joblib",
                               n_jobs=-1)
    cdqa_pipeline.fit_retriever(X=df)

    eval_dict = evaluate_pipeline(cdqa_pipeline,
                                  "./test_data.json",
                                  output_dir=None)

    assert eval_dict["exact_match"] > 0.8

    assert eval_dict["f1"] > 0.8
Code example #3
import os

import joblib
import pandas as pd

from cdqa.pipeline import QAPipeline
from cdqa.utils.converters import pdf_converter

# `storage` is assumed to be a configured Firebase Storage client (e.g. from pyrebase).


def fine_tuning_drive(question, file_name):
    storage.child("docs/" + file_name).download("/docs/", "docs/" + file_name)
    df = pdf_converter(directory_path="docs/")
    pd.set_option('display.max_colwidth', -1)
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    cdqa_pipeline.fit_retriever(df=df)
    joblib.dump(cdqa_pipeline, './models/bert_qa_custom.joblib')
    cdqa_pipeline = joblib.load('./models/bert_qa_custom.joblib')
    prediction = cdqa_pipeline.predict(question, n_predictions=1)
    os.remove("docs/" + file_name)
    return prediction
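A usage sketch for the function above, assuming the Firebase client is configured and "sample.pdf" (a placeholder name) has been uploaded under docs/:

prediction = fine_tuning_drive("What does the document describe?", "sample.pdf")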
Code example #4
from cdqa.pipeline import QAPipeline
from cdqa.utils.converters import pdf_converter


def qna(query):
    df = pdf_converter(directory_path='./media/pdf')
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    # Fit Retriever to documents
    cdqa_pipeline.fit_retriever(df=df)
    # INPUT QUESTION
    print("\n\n", query)
    #query = 'when was the second Indian Factory Act passed?'
    prediction = cdqa_pipeline.predict(query)
    # ans = 'query: {}\n \nanswer: {} \ntitle: {} \nparagraph: {}'.format(query,prediction[0],prediction[1],prediction[2])
    ans = [query, prediction[0], prediction[1], prediction[2]]
    return ans
Code example #5
from flask_restful import Resource, reqparse

from cdqa.pipeline import QAPipeline
from cdqa.utils.converters import pdf_converter


# Only the POST handler appears in the source; the Resource class name here is hypothetical.
class QnAResource(Resource):
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument('query', type=str, required=True)
        args = parser.parse_args()

        df = pdf_converter(directory_path='./data/pdf/')
        cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib',
                                   max_df=1.0)

        cdqa_pipeline.fit_retriever(df=df)

        prediction = cdqa_pipeline.predict(args.query)

        return {'data': prediction}, 200
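A minimal wiring sketch for the resource above (the app and route names are assumptions):

from flask import Flask
from flask_restful import Api

app = Flask(__name__)
api = Api(app)
api.add_resource(QnAResource, '/qna')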
Code example #6
File: QABot.py Project: kvsista/max_bot
import pandas as pd
from ast import literal_eval

from cdqa.pipeline import QAPipeline


def max_qa_bot(query):
    # df = pdf_converter(directory_path='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/pdf_files')
    df = pd.read_csv(
        'C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/data/data.csv',
        converters={'paragraphs': literal_eval})
    # df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(
        reader=
        'C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/models/bert_qa_vCPU-sklearn.joblib'
    )
    cdqa_pipeline.fit_retriever(df=df)

    # recognizer = sr.Recognizer()
    # # recognizer.pause_threshold = 5.0
    # with sr.Microphone() as source:
    #     # print("[search edureka: search youtube]")
    #     print("Speak Now")
    #     audio = recognizer.listen(source)
    #     query = recognizer.recognize_google(audio).capitalize()
    #     print(query)

    # query = "What is td ameritrade"
    prediction = cdqa_pipeline.predict(query)

    # print('query: {}\n'.format(query))
    # print('answer: {}\n'.format(prediction[0]))
    # print('title: {}\n'.format(prediction[1]))
    # print('paragraph: {}\n'.format(prediction[2]))

    # # Initializing the Text-to-Speech engine
    # engine = pyttsx3.init()

    # david = "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0"
    # zira = "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_ZIRA_11.0"
    # engine.setProperty('rate', 150)
    # engine.setProperty('volume', 1.0)
    # engine.setProperty('voice', david)
    # engine.say(prediction[2])
    # engine.runAndWait()
    # engine.stop()

    # result = ('Question: {}\n'.format(query).capitalize()) + ('Answer: {}\n'.format(prediction[0]).capitalize()) + ('Subject: {}\n'.format(prediction[1]).capitalize()) + ('Paragraph: {}\n'.format(prediction[2]).capitalize())
    result = prediction[2].capitalize()
    return result
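Example call, reusing the query from the commented-out code above (the local CSV and model paths must exist):

print(max_qa_bot("What is td ameritrade"))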
Code example #7
from cdqa.pipeline import QAPipeline
from cdqa.utils.converters import pdf_converter
from cdqa.utils.download import download_model

# Download model
download_model(model='bert-squad_1.1', dir='./models')

# INPUT PDFs
# Here path is the folder of the PDFs to be used
df = pdf_converter(
    directory_path='C:/Users/Viswash/Desktop/Work/ChatBot/Research/ Papers/')
df.head()

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)

# INPUT QUESTION
query = 'when was the second Indian Factory Act passed?'

prediction = cdqa_pipeline.predict(query)

ans = 'query: {} \nanswer: {} \ntitle: {} \nparagraph: {}'.format(
    query, prediction[0], prediction[1], prediction[2])
print(ans)
# OUTPUT
# print('query: {}'.format(query))
# print('answer: {}'.format(prediction[0]))
# print('title: {}'.format(prediction[1]))
# print('paragraph: {}'.format(prediction[2]))
Code example #8
import gc
import os
import pickle

from django.core.files.storage import FileSystemStorage
from django.http import JsonResponse
from django.shortcuts import render

from cdqa.pipeline import QAPipeline
from cdqa.utils.converters import pdf_converter


def indexdq(request):
    if request.POST:
        if 'file' in request.FILES:
            request.session['proj_id'] = request.POST['proj_id']
            uploaded_file = request.FILES['file']
            request.session['name'] = uploaded_file.name.split(".")[0]
            fs = FileSystemStorage()
            if not os.path.exists("media/" + str(request.user.id)):
                os.makedirs("media/" + str(request.user.id))
            filename = fs.save(
                str(request.user.id) + "/pdfs/" + uploaded_file.name,
                uploaded_file)
            uploaded_file_url = fs.url(filename)
            print(uploaded_file_url)
            print(os.getcwd())
            print(os.listdir('media/2/pdfs/'))
            df = pdf_converter(directory_path='media/' + str(request.user.id) +
                               '/pdfs/')
            print(df)

            from cdqa.utils.download import download_squad, download_model, download_bnpp_data

            directory = '/home/tanmay/Downloads'

            # Downloading data
            download_squad(dir=directory)
            download_bnpp_data(dir=directory)

            # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
            download_model('bert-squad_1.1', dir=directory)

            # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
            download_model('distilbert-squad_1.1', dir=directory)

            cdqa_pipeline = QAPipeline(
                reader='/home/tanmay/Downloads/bert_qa.joblib'
            )  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
            cdqa_pipeline.fit_retriever(df=df)

            pkl_filename = '/home/tanmay/Downloads/' + request.session[
                'name'] + 'query.pkl'
            with open(pkl_filename, 'wb') as file:
                pickle.dump(cdqa_pipeline, file)
            cdqa_pipeline = ""
            uploaded_file = ""
            df = ""
            gc.collect()
            # joblib.dump(cdqa_pipeline, '/home/tanmay/Downloads/'+request.session['name']+'query.joblib') #did not work
            # cdqa_pipeline.dump_reader('/home/tanmay/Downloads/'+request.session['name']+'query.joblib') #did not work
            request.session[
                "model_url"] = '/home/tanmay/Downloads/' + request.session[
                    'name'] + 'query.pkl'
            rdata = {"result": "Model is trained"}
            return JsonResponse(rdata)
        else:
            pkl_filename = request.session["model_url"]
            with open(pkl_filename, 'rb') as file:
                cdqa_pipeline = pickle.load(file)
            question = request.POST["question"]
            # cdqa_pipeline = QAPipeline(reader= request.session['model_url'])
            Ans = cdqa_pipeline.predict(question)
            cdqa_pipeline = ""
            gc.collect()
            print(Ans)
            rdata = {"one_word": Ans[0], "paragraph": Ans[2]}
            return JsonResponse(rdata)
    else:
        return render(request, "ml/docquery/index.html")
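A matching URL configuration sketch for this Django view (module and route names are placeholders):

# urls.py
from django.urls import path

from . import views

urlpatterns = [
    path('docquery/', views.indexdq, name='indexdq'),
]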
Code example #9
import pickle
import re

import pandas as pd
from flask import Flask, jsonify, render_template

from cdqa.pipeline import QAPipeline

app = Flask('Customer Warriors')

dataframe_from_pkl = pd.read_pickle('./csv_of_df_scm.pkl')

with open('urldict.pickle', 'rb') as handle:
    url_dict = pickle.load(handle)

model = QAPipeline(reader='./distilbert_qa_finetuned.joblib', max_df=1.0)
model.fit_retriever(df=dataframe_from_pkl)


def show_predictions(pred, url_dict):
    return (pred[0]), (url_dict.get(pred[1])), (pred[2])


@app.route('/')
@app.route('/index.html')
def home():
    return render_template('index.html')


@app.route('/SomeSampleQnAs.html')
def show_sample_qnas():
    return render_template('SomeSampleQnAs.html')
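A hypothetical route showing how `model` and `show_predictions` might serve answers (the route, parameter name, and response shape are assumptions, not from the source):

from flask import jsonify, request


@app.route('/ask')
def ask():
    pred = model.predict(request.args.get('query', ''))
    answer, url, paragraph = show_predictions(pred, url_dict)
    return jsonify(answer=answer, url=url, paragraph=paragraph)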
Code example #10
File: model.py Project: sebbersk/Surmize
import os
import shutil
from ast import literal_eval

import pandas as pd

from cdqa.pipeline import QAPipeline
from cdqa.utils.converters import pdf_converter
from cdqa.utils.filters import filter_paragraphs

# `trained_weights` is assumed to point at a pretrained cdQA reader
# (e.g. ./models/bert_qa.joblib) defined elsewhere in the project.


class QA:
    def __init__(self):
        # Fix in order to convert only one file at a time
        # https://github.com/cdqa-suite/cdQA/issues/224
        self.cdqa_pipeline = QAPipeline(reader=trained_weights,
                                        max_df=1,
                                        min_df=1)

    def predict(self, question):
        """
        Answer a question.
        Input parameter: a text string containing a question.

        Returns:
        A tuple (answer, context, score): the direct answer to the question,
        the sentence/context where the answer was found, and the model score.
        """
        answer, title, context, score = self.cdqa_pipeline.predict(question)
        return answer, context, score

    def convert_data(self, filepath):
        """
        Convert a data file (.csv, .txt, .story, or .pdf)
        to txt and fit the retriever on it.
        """
        filename = os.path.basename(filepath)
        name, extension = os.path.splitext(str(filename))
        root, _ = filepath.split(f"/text/{filename}")
        filepath_txt = f"{root}/text/{name}.txt"
        filepath_csv = f"{root}/csv/{name}.csv"

        if extension == ".csv":
            # csv needs to have "title" and "paragraphs" features
            df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
            df = filter_paragraphs(df)
            # https://stackoverflow.com/questions/51491931/reading-text-files-from-subfolders-and-folders-and-creating-a-dataframe-in-panda

        elif extension == ".txt" or extension == ".story":
            lines = []
            # Read file and remove non UTF-8 chars
            with open(filepath, encoding="utf8", errors='ignore') as f:
                for line in f:
                    lines.append(
                        bytes(line, "utf-8").decode("utf-8", "ignore"))
                paragraphs = lines

            # Make df to use in QA
            df = pd.DataFrame({"title": filename, "paragraphs": [paragraphs]})
            with open(filepath_txt, "w+") as f:
                for line in lines:
                    f.write(line)

        elif extension == ".pdf":
            tmp_dir = f"{root}/tmp"
            tmp_filepath = f"{tmp_dir}/{filename}"

            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)
            shutil.copyfile(filepath, tmp_filepath)

            df = pdf_converter(directory_path=tmp_dir)
            shutil.rmtree(tmp_dir, ignore_errors=True)
            os.remove(filepath)  # Remove original pdf file

            with open(filepath_txt, "w") as file:
                for line in df.loc[0]["paragraphs"]:
                    file.write("\n" + line)

        #df.to_csv(f"{filepath_csv}", index=False)
        self.cdqa_pipeline.fit_retriever(df=df)

    def convert_and_load(self, filepath=None, filename=None):
        self.convert_data(filepath)
        #self.load_data(filepath)

    def load_data(self, filepath=None):
        """
        Read in a data file/path and determine the file type.
        If no file type is given, assume the folder contains PDFs.
        """
        df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
        df = filter_paragraphs(df)
        self.cdqa_pipeline.fit_retriever(df=df)
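A usage sketch for the class above (the file path and question are placeholders; convert_data assumes the /text/ and /csv/ folder layout visible in its path handling):

qa = QA()
qa.convert_data("./data/text/report.txt")
answer, context, score = qa.predict("What is the report about?")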
Code example #11
import os
from ast import literal_eval
import pandas as pd

from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline

df = pd.read_csv('esrc_pdfs.csv', converters={'paragraphs': literal_eval})

cdqa_pipeline = QAPipeline(
    reader='/resources/cdQA/bert_qa.joblib'
)  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
cdqa_pipeline.fit_retriever(df=df)  # fit_retriever is correct here: it indexes df; fit_reader would retrain the reader

cdqa_pipeline.dump_reader('/resources/cdQA/bert-reader.joblib')

query = 'Since when does the Excellence Program of BNP Paribas exist?'  # placeholder; `query` was undefined in the source
prediction = cdqa_pipeline.predict(query, n_predictions=5)


def make_prediction(query, n_predictions):

    prediction = cdqa_pipeline.predict(query, n_predictions=n_predictions)

    return prediction
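Example call, assuming (as in code example #1) that each prediction is an (answer, title, paragraph, score) tuple:

for answer, title, paragraph, score in make_prediction(query, 3):
    print(answer, "-", title)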