Example #1
def create_answers(deck_id, query):
    decks_collection = db["decks"]
    slides_collection = db["slides"]
    # find_one returns the matching document directly instead of a cursor
    target_deck = decks_collection.find_one({"_id": deck_id})
    author_id = target_deck["user"]
    author_slides = slides_collection.find({"user": author_id})

    rows = []

    for slide in author_slides:
        revision = slide["revisions"][-1]
        usages = revision["usage"]
        for usage in usages:
            if usage["id"] == deck_id:
                content = html2text.html2text(revision["content"])
                paragraphs = content.split('\n\n')
                rows.append({
                    "date": revision["timestamp"],
                    "title": revision["title"],
                    "category": "Information",
                    "link": "",
                    "abstract": "",
                    "paragraphs": paragraphs,
                    "revision_id": revision["id"],
                    "slide_id": slide["_id"]
                })
                break

    # build the DataFrame once; DataFrame.append was removed in pandas 2.0
    df = pd.DataFrame(rows)

    download_model(model='bert-squad_1.1', dir='./models')

    # df = filter_paragraphs(df)
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib',
                               max_df=0.95,
                               min_df=3)
    cdqa_pipeline.fit_retriever(df)

    predictions = cdqa_pipeline.predict(
        query, n_predictions=5)  # retriever_score_weight=0.99

    answers = []
    for prediction in predictions:
        # look up the source slide/revision for this prediction's title
        row = df.loc[df["title"] == prediction[1]].iloc[0]
        answers.append({
            "slide_id": int(row['slide_id']),
            "revision_id": int(row['revision_id']),
            "answer": prediction[0],
            "title": prediction[1],
            "paragraph": prediction[2],
            "score": prediction[3]
        })

    return answers
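
The loop above unpacks cdQA predictions as (answer, title, paragraph, score) tuples. A minimal sketch of calling this helper; the deck id and query are placeholder values, and a real _id from the decks collection is needed:

# Hypothetical usage; assumes db, pd, html2text, download_model and
# QAPipeline are imported/configured as the function above expects.
answers = create_answers(deck_id=42, query="What is the main topic?")
for a in answers:
    print(a["score"], a["title"], a["answer"])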
Example #2
def download_bert_model():
    print("Checking BERT model existence")
    if os.path.isfile(BERT_MODEL_PATH):
        print(f"Found model: {BERT_MODEL_PATH}")
    else:
        print(f"Downloading model: {BERT_MODEL_PATH}")
        download_model(model='bert-squad_1.1', dir=MODELS_DIR)
    print("Finished checking BERT model")
Example #3
def get_distilbert_model():
    if not os.path.exists('./models'):
        os.makedirs('./models')
    if not os.path.exists('./models/distilbert_qa.joblib'):
        download_model(model="{}-squad_1.1".format('distilbert'),
                       dir='./models')
    return QAPipeline(reader='./models/distilbert_qa.joblib',
                      max_df=1.0,
                      min_df=1)
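
A minimal usage sketch for the helper above, assuming a DataFrame df laid out like the one built in Example #1 (title and paragraphs columns):

# Hypothetical usage; df is assumed to follow Example #1's layout.
pipeline = get_distilbert_model()
pipeline.fit_retriever(df)
prediction = pipeline.predict("What does this deck cover?")
print(prediction[0], prediction[2])  # answer and its source paragraph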
Example #4
#!/usr/bin/env python
# coding: utf-8


from cdqa.utils.download import download_model
download_model(model='bert-squad_1.1', dir='./models')
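
The download above drops the reader into ./models; the other examples on this page load it as bert_qa.joblib. A minimal follow-on sketch, assuming that filename:

# Assumes the download produced ./models/bert_qa.joblib, the filename
# Examples #1 and #2 load.
from cdqa.pipeline.cdqa_sklearn import QAPipeline
cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib')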

Example #5
#pip install -r 'requirements.txt'

import streamlit as st
import numpy as np
# download_bnpp_data was taken out of the import below
# (the download calls further down can be commented out once the files exist)
from cdqa.utils.download import download_squad, download_model

import nltk
nltk.download('punkt')
import pandas as pd

# if the models and data are already in place, the following 4 lines can be commented out
download_model('bert-squad_1.1')
directory = '/home/antony/u6/cdQA/data'
download_squad(dir=directory)
download_model('bert-squad_1.1', dir=directory)
download_model('distilbert-squad_1.1', dir=directory)

from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline.cdqa_sklearn import QAPipeline
from nltk import tokenize


def load_from_csv(file):
    df = pd.read_csv(file)
    df = df.rename(str.lower, axis='columns')
    # strip possessives and newlines from the paragraph text
    df['paragraphs'] = df['paragraphs'].apply(
        lambda x: x.replace("'s", " s").replace("\n", " "))
    return df
Example #6
def indexdq(request):
    if request.method == "POST":
        if 'file' in request.FILES:
            request.session['proj_id'] = request.POST['proj_id']
            uploaded_file = request.FILES['file']
            request.session['name'] = uploaded_file.name.split(".")[0]
            fs = FileSystemStorage()
            if not os.path.exists("media/" + str(request.user.id)):
                os.makedirs("media/" + str(request.user.id))
            filename = fs.save(
                str(request.user.id) + "/pdfs/" + uploaded_file.name,
                uploaded_file)
            uploaded_file_url = fs.url(filename)
            print(uploaded_file_url)
            print(os.getcwd())
            print(os.listdir('media/' + str(request.user.id) + '/pdfs/'))
            df = pdf_converter(directory_path='media/' + str(request.user.id) +
                               '/pdfs/')
            print(df)

            from cdqa.utils.download import download_squad, download_model, download_bnpp_data

            directory = '/home/tanmay/Downloads'

            # Downloading data
            download_squad(dir=directory)
            download_bnpp_data(dir=directory)

            # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
            download_model('bert-squad_1.1', dir=directory)

            # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
            download_model('distilbert-squad_1.1', dir=directory)

            cdqa_pipeline = QAPipeline(
                reader='/home/tanmay/Downloads/bert_qa.joblib'
            )  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
            cdqa_pipeline.fit_retriever(df=df)

            pkl_filename = '/home/tanmay/Downloads/' + request.session[
                'name'] + 'query.pkl'
            with open(pkl_filename, 'wb') as file:
                pickle.dump(cdqa_pipeline, file)
            cdqa_pipeline = ""
            uploaded_file = ""
            df = ""
            gc.collect()
            # joblib.dump(cdqa_pipeline, '/home/tanmay/Downloads/'+request.session['name']+'query.joblib') #did not work
            # cdqa_pipeline.dump_reader('/home/tanmay/Downloads/'+request.session['name']+'query.joblib') #did not work
            request.session["model_url"] = pkl_filename
            rdata = {"result": "Model is trained"}
            return JsonResponse(rdata)
        else:
            pkl_filename = request.session["model_url"]
            with open(pkl_filename, 'rb') as file:
                cdqa_pipeline = pickle.load(file)
            question = request.POST["question"]
            # cdqa_pipeline = QAPipeline(reader= request.session['model_url'])
            Ans = cdqa_pipeline.predict(question)
            cdqa_pipeline = ""
            gc.collect()
            print(Ans)
            rdata = {"one_word": Ans[0], "paragraph": Ans[2]}
            return JsonResponse(rdata)
    else:
        return render(request, "ml/docquery/index.html")
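
For context, a minimal sketch of a client exercising both branches of the view above; the endpoint URL and authentication are assumptions, while the field names ('file', 'proj_id', 'question') come from the code:

# Hypothetical client; the URL is an assumption, and the view expects an
# authenticated session (request.user.id is used to build the media path).
import requests

session = requests.Session()
# 1) upload a PDF so the view fits and pickles a QAPipeline
with open('doc.pdf', 'rb') as f:
    r = session.post('http://localhost:8000/ml/docquery/',
                     data={'proj_id': '1'}, files={'file': f})
print(r.json())  # {"result": "Model is trained"}

# 2) ask a question against the pickled pipeline
r = session.post('http://localhost:8000/ml/docquery/',
                 data={'question': 'What is the document about?'})
print(r.json())  # {"one_word": ..., "paragraph": ...}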