def pdf_to_tokens(dir_name='.'):
    sents = []
    df = pdf_converter(directory_path=dir_name)
    for line in df["paragraphs"][0]:
        sents.extend(sent_tokenize(line))
    sent_tokens = []
    for sent in sents:
        sent_tokens.append(word_tokenize(sent))
    return sent_tokens
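# Usage sketch (not part of the original snippet): assumes pdf_converter and the
# NLTK tokenizers (sent_tokenize, word_tokenize) are imported elsewhere in the
# module and that the NLTK 'punkt' data is installed; './pdf_folder' is a
# hypothetical directory containing at least one PDF.
tokens = pdf_to_tokens(dir_name='./pdf_folder')
print(tokens[:2])  # first two tokenized sentences of the first document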
def fine_tuning_drive(question, file_name):
    storage.child("docs/" + file_name).download("/docs/", "docs/" + file_name)
    df = pdf_converter(directory_path="docs/")
    pd.set_option('display.max_colwidth', -1)
    df.head()
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    cdqa_pipeline.fit_retriever(df=df)
    joblib.dump(cdqa_pipeline, './models/bert_qa_custom.joblib')
    cdqa_pipeline = joblib.load('./models/bert_qa_custom.joblib')
    prediction = cdqa_pipeline.predict(question, 1)
    os.remove("docs/" + file_name)
    return prediction
def find_answer(question):
    # Set your path to pdf directory
    df = pdf_converter(directory_path='pdf_folder/')
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
    cdqa_pipeline.fit_retriever(df)
    query = question + '?'
    prediction = cdqa_pipeline.predict(query)
    # print('query: {}\n'.format(query))
    # print('answer: {}\n'.format(prediction[0]))
    # print('title: {}\n'.format(prediction[1]))
    # print('paragraph: {}\n'.format(prediction[2]))
    return prediction[0]
def qna(query):
    df = pdf_converter(directory_path='./media/pdf')
    df.head()
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    # Fit Retriever to documents
    cdqa_pipeline.fit_retriever(df=df)
    # INPUT QUESTION
    print("\n\n", query)
    # query = 'when was the second Indian Factory Act passed?'
    prediction = cdqa_pipeline.predict(query)
    # ans = 'query: {}\n \nanswer: {} \ntitle: {} \nparagraph: {}'.format(query, prediction[0], prediction[1], prediction[2])
    ans = [query, prediction[0], prediction[1], prediction[2]]
    return ans
def post(self):
    parser = reqparse.RequestParser()
    parser.add_argument('query', type=str, required=True)
    args = parser.parse_args()
    df = pdf_converter(directory_path='./data/pdf/')
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    cdqa_pipeline.fit_retriever(df=df)
    prediction = cdqa_pipeline.predict(args.query)
    return {'data': prediction}, 200
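# Hedged sketch, not from the original source: one way the post() handler above
# could be exposed with Flask-RESTful. The class name QueryAPI and the '/query'
# route are assumptions; the real resource class holding post() is not shown.
from flask import Flask
from flask_restful import Api, Resource

class QueryAPI(Resource):
    def post(self):
        ...  # body as in the post() method above

app = Flask(__name__)
api = Api(app)
api.add_resource(QueryAPI, '/query')

if __name__ == '__main__':
    app.run(debug=True)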
def convert_data(self, filepath):
    """ Convert data files to txt """
    filename = os.path.basename(filepath)
    name, extension = os.path.splitext(str(filename))
    root, _ = filepath.split(f"/text/{filename}")
    filepath_txt = f"{root}/text/{name}.txt"
    filepath_csv = f"{root}/csv/{name}.csv"
    if extension == ".csv":
        # csv needs to have "title" and "paragraphs" features
        df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
        df = filter_paragraphs(df)
    # https://stackoverflow.com/questions/51491931/reading-text-files-from-subfolders-and-folders-and-creating-a-dataframe-in-panda
    elif extension == ".txt" or extension == ".story":
        lines = []
        # Read file and remove non UTF-8 chars
        with open(filepath, encoding="utf8", errors='ignore') as f:
            for line in f:
                lines.append(bytes(line, "utf-8").decode("utf-8", "ignore"))
        paragraphs = lines
        # Make df to use in QA
        df = pd.DataFrame({"title": filename, "paragraphs": [paragraphs]})
        with open(filepath_txt, "w+") as f:
            for line in lines:
                f.write(line)
    elif extension == ".pdf":
        tmp_dir = f"{root}/tmp"
        tmp_filepath = f"{tmp_dir}/{filename}"
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
        shutil.copyfile(filepath, tmp_filepath)
        df = pdf_converter(directory_path=tmp_dir)
        shutil.rmtree(tmp_dir, ignore_errors=True)
        os.remove(filepath)  # Remove original pdf file
        with open(filepath_txt, "w") as file:
            for line in df.loc[0]["paragraphs"]:
                file.write("\n" + line)
    # df.to_csv(f"{filepath_csv}", index=False)
    self.cdqa_pipeline.fit_retriever(df=df)
def search_view(request):
    if request.POST:
        question = request.POST.get('question')
        for idx, url in enumerate(
                search(question, tld="com", num=10, stop=3, pause=2)):
            crawl_result(url, idx)
        # change path to pdfs folder
        df = pdf_converter(directory_path='/path/to/pdfs')
        cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
        cdqa_pipeline.fit_retriever(df)
        prediction = cdqa_pipeline.predict(question)
        data = {'answer': prediction[0]}
        return JsonResponse(data)
    return render(request, 'search.html')
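# Hedged sketch, not from the original project: a minimal urls.py entry that
# could route requests to search_view; the 'search/' path and the "views"
# module name are assumptions.
from django.urls import path
from . import views

urlpatterns = [
    path('search/', views.search_view, name='search'),
]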
def wholeShot():
    print("UPDATING CORPUS")
    df = pd.read_csv("pdfs/data.csv")
    df2 = pdf_converter(directory_path='pdfs')
    df2.dropna()
    df2 = df2.mask(df2.eq('None')).dropna()
    df2 = filter_paragraphs(df2)
    # for rows in df2
    #   if row title is not in df
    #     append row to df
    print(len(df2))
    for index, row in df2.iterrows():
        if str(row['title']) not in list(df['title']):
            df = df.append(row, ignore_index=True)
    print(df)
    df.to_csv("pdfs/data.csv", index=False)
    os.system("docker stop search-engine")
    os.system("docker rm search-engine")
    os.system("docker build -t search-engine . -f dockerfile-service")
    os.system(
        "docker run -d -p 5000:5000 -v /home/sidworld/sixgod/pdfs:/pdfs --name search-engine --rm search-engine "
    )
import os
import pandas as pd
from ast import literal_eval
from cdqa.utils.converters import pdf_converter
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model

# Download model
download_model(model='bert-squad_1.1', dir='./models')

# INPUT PDFs
# Here path is the folder of the PDFs to be used
df = pdf_converter(
    directory_path='C:/Users/Viswash/Desktop/Work/ChatBot/Research/Papers/')
df.head()

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)

# INPUT QUESTION
query = 'when was the second Indian Factory Act passed?'
prediction = cdqa_pipeline.predict(query)

ans = 'query: {} \nanswer: {} \ntitle: {} \nparagraph: {}'.format(
    query, prediction[0], prediction[1], prediction[2])
print(ans)
def test_pdf_converter(self):
    df = pdf_converter(directory_path=self.assets_folder)
    self.df_converter_check(df)
    df_line_para = pdf_converter(directory_path=self.assets_folder,
                                 include_line_breaks=True)
    self.df_converter_check(df_line_para, True)
    'https://invest.bnpparibas.com/documents/4q18-pr-18000',
    'https://invest.bnpparibas.com/documents/4q17-pr'
]

print('\nDownloading PDF files...')
if not os.path.exists(directory):
    os.makedirs(directory)
for url in models_url:
    wget.download(url=url, out=directory)

# download_pdf()

# Convert the pdf files into a dataframe
df = pdf_converter(directory_path='./data/pdf/')
print(df.head())
print("pdf files converted")

# Instantiate cdQA pipeline from model
cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
# cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)

# pre-req setup end

# Evaluating Models
from cdqa.utils.converters import df2squad
def test_pdf_converter(self):
    df = pdf_converter(directory_path=self.assets_folder)
    self.df_converter_check(df)
import os
import time
import re
import pandas as pd
from ast import literal_eval
from cdqa.utils.converters import pdf_converter
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model
from writeToExcel import writeToExcel

# To download the pretrained BERT SQUAD Model.
download_model(model='bert-squad_1.1', dir='./models')

df = pdf_converter(directory_path='./dataset')
cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)
def indexdq(request):
    if request.POST:
        if 'file' in request.FILES:
            request.session['proj_id'] = request.POST['proj_id']
            uploaded_file = request.FILES['file']
            request.session['name'] = uploaded_file.name.split(".")[0]
            fs = FileSystemStorage()
            if not os.path.exists("media/" + str(request.user.id)):
                os.makedirs("media/" + str(request.user.id))
            filename = fs.save(
                str(request.user.id) + "/pdfs/" + uploaded_file.name,
                uploaded_file)
            uploaded_file_url = fs.url(filename)
            print(uploaded_file_url)
            print(os.getcwd())
            print(os.listdir('media/2/pdfs/'))
            df = pdf_converter(directory_path='media/' + str(request.user.id) + '/pdfs/')
            print(df)

            from cdqa.utils.download import download_squad, download_model, download_bnpp_data
            directory = '/home/tanmay/Downloads'
            # Downloading data
            download_squad(dir=directory)
            download_bnpp_data(dir=directory)
            # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
            download_model('bert-squad_1.1', dir=directory)
            # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
            download_model('distilbert-squad_1.1', dir=directory)

            cdqa_pipeline = QAPipeline(
                reader='/home/tanmay/Downloads/bert_qa.joblib'
            )  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
            cdqa_pipeline.fit_retriever(df=df)

            pkl_filename = '/home/tanmay/Downloads/' + request.session['name'] + 'query.pkl'
            with open(pkl_filename, 'wb') as file:
                pickle.dump(cdqa_pipeline, file)
            cdqa_pipeline = ""
            uploaded_file = ""
            df = ""
            gc.collect()
            # joblib.dump(cdqa_pipeline, '/home/tanmay/Downloads/' + request.session['name'] + 'query.joblib')  # did not work
            # cdqa_pipeline.dump_reader('/home/tanmay/Downloads/' + request.session['name'] + 'query.joblib')  # did not work
            request.session["model_url"] = '/home/tanmay/Downloads/' + request.session['name'] + 'query.pkl'
            rdata = {"result": "Model is trained"}
            return JsonResponse(rdata)
        else:
            pkl_filename = request.session["model_url"]
            with open(pkl_filename, 'rb') as file:
                cdqa_pipeline = pickle.load(file)
            question = request.POST["question"]
            # cdqa_pipeline = QAPipeline(reader=request.session['model_url'])
            Ans = cdqa_pipeline.predict(question)
            cdqa_pipeline = ""
            gc.collect()
            print(Ans)
            rdata = {"one_word": Ans[0], "paragraph": Ans[2]}
            return JsonResponse(rdata)
    else:
        return render(request, "ml/docquery/index.html")
import pandas as pd
from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline
from cdqa.utils.converters import df2squad
from cdqa.utils.converters import pdf_converter

# Download data and models
download_model(model='bert-squad_1.1', dir='./models')

# Loading data and filtering / preprocessing the documents
df = pdf_converter(
    directory_path='/mnt/c/Users/amerj/Documents/python_atom/syllabus_files')

# Loading QAPipeline with CPU version of BERT Reader pretrained on SQuAD 1.1
cdqa_pipeline = QAPipeline(reader='models/bert_qa_vGPU-sklearn.joblib')

# Fitting the retriever to the list of documents in the dataframe
cdqa_pipeline.fit_retriever(df)

# Sending a question to the pipeline and getting prediction
query = 'Where are quizzes and exams?'
prediction = cdqa_pipeline.predict(query)

print('query: {}\n'.format(query))
print('answer: {}\n'.format(prediction[0]))
print('title: {}\n'.format(prediction[1]))
print('paragraph: {}\n'.format(prediction[2]))