def create_answers(deck_id, query):
    """Answer ``query`` using the slides that are used in deck ``deck_id``.

    The deck's ``user`` field identifies the author; every slide of that
    author whose latest revision lists ``deck_id`` among its ``usage``
    entries contributes one document (the revision content converted from
    HTML to text and split on blank lines) to the retriever corpus.

    Args:
        deck_id: Value matched against the ``_id`` of the deck and against
            the ``id`` of each usage entry.
        query: Question text handed to the QA pipeline.

    Returns:
        A list of dicts, one per prediction (five are requested), with the
        keys ``slide_id``, ``revision_id``, ``answer``, ``title``,
        ``paragraph`` and ``score``. An empty list is returned when no
        slide of the author is used in the deck.
    """
    decks_collection = db["decks"]
    slides_collection = db["slides"]

    target_deck = decks_collection.find({"_id": deck_id})[0]
    author_slides = slides_collection.find({"user": target_deck["user"]})

    # Collect plain dicts and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas >= 2.0.
    rows = []
    for slide in author_slides:
        revision = slide["revisions"][-1]
        if not any(usage["id"] == deck_id for usage in revision["usage"]):
            continue
        content = html2text.html2text(revision["content"])
        rows.append({
            "date": revision["timestamp"],
            "title": revision["title"],
            "category": "Infromation",
            "link": "",
            "abstract": "",
            "paragraphs": content.split('\n\n'),
            "revision_id": revision["id"],
            "slide_id": slide["_id"],
        })

    # Nothing to search in: skip the model download and the pipeline.
    if not rows:
        return []
    df = pd.DataFrame(rows)

    download_model(model='bert-squad_1.1', dir='./models')
    cdqa_pipeline = QAPipeline(
        reader='models/bert_qa.joblib', max_df=0.95, min_df=3)
    cdqa_pipeline.fit_retriever(df)
    predictions = cdqa_pipeline.predict(query, n_predictions=5)

    answers = []
    for prediction in predictions:
        # Map the predicted title back to its source row (first match wins).
        source_row = df.loc[df["title"] == prediction[1]].iloc[0]
        answers.append({
            "slide_id": int(source_row["slide_id"]),
            "revision_id": int(source_row["revision_id"]),
            "answer": prediction[0],
            "title": prediction[1],
            "paragraph": prediction[2],
            "score": prediction[3],
        })
    return answers
def question(
    text,
    query,
    reader_path='D:/devjams/Machine-Learning-Web-Apps-master/NLPIffy_NLP_Based_SpaCy_Flask_App&_API/bert_qa_vGPU-sklearn.joblib',
    sentences_per_paragraph=4,
):
    """Return the paragraph of ``text`` that the QA pipeline picks for ``query``.

    ``text`` is split into sentences, sentences of two characters or fewer
    are dropped, and the remainder is grouped into consecutive chunks that
    each become one document titled ``Title0``, ``Title1``, ... The
    retriever is fitted on those documents only.

    Args:
        text: Raw text to search in.
        query: Question to ask.
        reader_path: Path of the serialized reader model given to
            ``QAPipeline``.
        sentences_per_paragraph: Number of sentences grouped into one
            document.

    Returns:
        Element ``2`` of the pipeline prediction, or ``None`` when the
        prediction raised (the error is printed).
    """
    # Imported lazily, as before, so the module loads without cdQA present.
    from cdqa.pipeline.cdqa_sklearn import QAPipeline

    print(text)
    sentences = [s for s in sent_tokenize(text) if len(s) > 2]

    n = sentences_per_paragraph
    chunks = [sentences[i:i + n] for i in range(0, len(sentences), n)]
    titles = [f'Title{j}' for j in range(len(chunks))]
    df3 = pd.DataFrame({'title': titles, 'paragraphs': chunks})
    print(df3)

    # The BNP Paribas CSV is not loaded here: only the documents built from
    # ``text`` are ever passed to the retriever.
    cdqa_pipeline = QAPipeline(reader=reader_path)
    cdqa_pipeline.fit_retriever(df3)
    print(query)

    try:
        prediction = cdqa_pipeline.predict(query)
    except Exception as e:
        # Report the failure instead of falling through to an unbound
        # ``prediction`` (which surfaced as UnboundLocalError).
        print(e)
        return None
    return prediction[2]
def find_answer(question):
    """Return the answer text the QA pipeline extracts for ``question``.

    The corpus is built from the PDFs under ``pdf_folder/`` and a ``?`` is
    appended to ``question`` before it is sent to the pipeline. Element
    ``0`` of the prediction is returned.
    """
    # Set your path to the pdf directory here.
    corpus = pdf_converter(directory_path='pdf_folder/')

    pipeline = QAPipeline(reader='models/bert_qa.joblib')
    pipeline.fit_retriever(corpus)

    return pipeline.predict(question + '?')[0]
def search_view(request):
    """Render the search page, or answer a posted question as JSON.

    Without POST data the ``search.html`` template is rendered. With POST
    data, the search results for the ``question`` field are passed to
    ``crawl_result`` together with their index, the PDF folder is
    converted into a corpus, and element ``0`` of the pipeline prediction
    is returned as ``{'answer': ...}``.
    """
    if not request.POST:
        return render(request, 'search.html')

    question = request.POST.get('question')
    hits = search(question, tld="com", num=10, stop=3, pause=2)
    for position, url in enumerate(hits):
        crawl_result(url, position)

    # change path to pdfs folder
    corpus = pdf_converter(directory_path='/path/to/pdfs')
    pipeline = QAPipeline(reader='models/bert_qa.joblib')
    pipeline.fit_retriever(corpus)

    best = pipeline.predict(question)
    return JsonResponse({'answer': best[0]})
def execute_pipeline(query):
    """Run the QA pipeline on the BNP Paribas corpus for ``query``.

    Downloads the dataset and the reader model, fits the retriever on the
    filtered paragraphs and returns elements ``0`` and ``1`` of the
    prediction as a tuple.
    """
    download_bnpp_data('./data/bnpp_newsroom_v1.1/')
    download_model('bert-squad_1.1', dir='./models')

    raw_docs = pd.read_csv(
        './data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
        converters={'paragraphs': literal_eval},
    )
    corpus = filter_paragraphs(raw_docs)

    pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')
    pipeline.fit_retriever(X=corpus)

    best = pipeline.predict(X=query)
    return (best[0], best[1])
def execute_pipeline(query):
    """Run the QA pipeline on the local BNP Paribas CSV for ``query``.

    The pipeline is fitted on the filtered paragraphs, the reader's
    ``output_dir`` is reset to ``None`` before predicting, and elements
    ``0`` and ``1`` of the prediction are returned as a tuple.
    """
    raw_docs = pd.read_csv(
        'data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
        converters={'paragraphs': literal_eval},
    )
    corpus = filter_paragraphs(raw_docs)

    pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')
    pipeline.fit(X=corpus)
    pipeline.reader.output_dir = None

    best = pipeline.predict(X=query)
    return (best[0], best[1])
def execute_pipeline(query, n_predictions=None):
    """Run the QA pipeline on the BNP Paribas corpus for ``query``.

    Args:
        query: Question passed to the pipeline.
        n_predictions: When given, that many predictions are requested
            from the pipeline.

    Returns:
        ``(prediction[0], prediction[1])`` for the single prediction when
        ``n_predictions`` is ``None``; otherwise a list with one such
        tuple per returned prediction.
    """
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")

    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(reader="models/bert_qa_vCPU-sklearn.joblib")
    cdqa_pipeline.fit_retriever(X=df)

    if n_predictions is None:
        prediction = cdqa_pipeline.predict(X=query)
        return (prediction[0], prediction[1])

    predictions = cdqa_pipeline.predict(X=query, n_predictions=n_predictions)
    # Index instead of unpacking a fixed number of fields, so predictions
    # that carry extra trailing fields (e.g. a score) do not raise.
    return [(prediction[0], prediction[1]) for prediction in predictions]
def ask():
    """Answer the submitted question and render it on the index page.

    The question is read from the ``btn-input`` form field. The name
    stored in ``current.txt`` is passed to ``file_open``, whose result is
    turned into a two-column (``title``, ``paragraphs``) frame that the
    retriever is fitted on.

    Returns:
        The rendered ``index.html`` with ``value1`` set to the question
        and ``value2`` set to the answer followed by the related
        paragraph.
    """
    name = request.form['btn-input']

    # Context manager closes the file even if reading fails.
    with open('current.txt') as f:
        file1 = f.read().rstrip()

    cdqa_pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')

    row = file_open(file1)
    df = pd.DataFrame(row).T
    df.columns = ['title', 'paragraphs']

    # Fitting the retriever to the list of documents in the dataframe
    cdqa_pipeline.fit_retriever(df)
    prediction = cdqa_pipeline.predict(name)

    speech = prediction[0] + "\n\n Related Paragraph" + prediction[2]
    print('This is error output', speech)
    return render_template('index.html', value1=name, value2=speech)
import pandas as pd
from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline

# Fetch the dataset and the reader model (enable when they are not on disk):
#download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
#download_model(model='bert-squad_1.1', dir='./models')

# Read the documents; the 'paragraphs' column is parsed with literal_eval,
# then the frame is passed through filter_paragraphs.
df = filter_paragraphs(
    pd.read_csv(
        'data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
        converters={'paragraphs': literal_eval},
    )
)

# QAPipeline backed by the CPU build of the BERT reader.
cdqa_pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')

# Fit the retriever on the documents in the dataframe.
_ = cdqa_pipeline.fit_retriever(df)

# Ask one question and show what the pipeline returned.
query = 'Since when does the Excellence Program of BNP Paribas exist?'
prediction = cdqa_pipeline.predict(query)

print(f'query: {query}\n')
print(f'answer: {prediction[0]}\n')
print(f'title: {prediction[1]}\n')
print(f'paragraph: {prediction[2]}\n')
download_squad(dir=directory)
download_model('bert-squad_1.1', dir=directory)
download_model('distilbert-squad_1.1', dir=directory)

from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline.cdqa_sklearn import QAPipeline
from nltk import tokenize


def load_from_csv(file):
    """Load ``file`` into a frame whose ``paragraphs`` hold sentence lists.

    Column names are lower-cased. In every ``paragraphs`` value ``'s`` is
    replaced by `` s`` and newlines by spaces, after which the text is
    split into sentences.
    """
    frame = pd.read_csv(file).rename(str.lower, axis='columns')

    def _to_sentences(text):
        cleaned = text.replace("'s", " s").replace("\n", " ")
        return tokenize.sent_tokenize(cleaned)

    frame['paragraphs'] = frame['paragraphs'].apply(_to_sentences)
    return frame


df = load_from_csv('./data/test.csv')

# bert_qa.joblib must be in the same directory (cdQA); if it is not, move
# it here from data.
cdqa_pipeline = QAPipeline(reader='bert_qa.joblib')
cdqa_pipeline.fit_retriever(df=df)

querry = st.text_area('enter mssage', 'type')
if st.button('analyze'):
    message = cdqa_pipeline.predict(query=querry, n_predictions=2)
    st.success(message)
continue print('0~8의 인덱스가 안나온 경우 \n'+df.loc[list(best_idx_scores.keys())[0]]['title'])#테스트를 위한 #if max(retriever_temp.predict(ETRI_POS_Tagging(query)).values())>max(retriever.predict(ETRI_POS_Tagging(query)).values()): pass #else: cdqa_pipeline.fit_retriever(df.loc[best_idx_scores.keys()].head(1)) if max(retriever.predict(POS_query).values())<1.5 and max(retriever_temp.predict(POS_query).values())<1.5: print(ETRI_wiki(query)) continue kor_query=ETRI_korBERT(' '.join(list(df.loc[best_idx_scores.keys()].head(1)['paragraphs'])[0]),query) #---------------------------------- # temp_prediction=retriever_temp.predict(kor_query) # print(para[max(temp_prediction)]) #---------------------------------- prediction=cdqa_pipeline.predict(kor_query) print('cdqa 유사도 수치 '+str(prediction[3])) print(prediction[2]) # print('{}\n\n{}\n\n{}\n\n'.format(df.loc[list(best_idx_scores.keys())[0]]['content'],ETRI_POS_Tagging(query),query)) #test=cdqa_pipeline.predict(query) #max(retriever_temp.predict(ETRI_POS_Tagging('지원방법')).values()),max(retriever.predict(ETRI_POS_Tagging('지원방법')).values()) #if max(retriever_temp.predict(ETRI_POS_Tagging(query)).values())<1.5 and max(retriever.predict(ETRI_POS_Tagging(query)).values())<1.5: # print(ETRI_wiki(query)) # print('유사도가 낮어:'+str(max(sentence_idx_scores.values())))#위키로 처리하면 될듯 # else: # prediction=cdqa_pipeline.predict(kor_query) # print('cdqa 유사도 수치 '+str(prediction[3]))
cdqa_pipeline.fit_retriever(X=df_X)

#%%
# Evaluate the QnA system
from cdqa.utils.evaluation import evaluate_pipeline

evaluate_pipeline(cdqa_pipeline, 'cdqa-v1.1-tim_qna.json')

# Standard pre-trained model: {'exact_match': 0.0, 'f1': 5.025362668068075}
# Fine-tuned model: {'exact_match': 0.0, 'f1': 5.684362620078064}

#%% [markdown]
# ### Inference

#%%
prediction = cdqa_pipeline.predict(
    X='what would be a good gymnastic strength training goal to have?')
print(f'title: {prediction[1]}')
print(f'paragraph: {prediction[2]}')
print(f'answer: {prediction[0]}')

#%% [markdown]
# ### Findings:
# * Fine-tuning results in worse performance
# * F1 score is really low --> not a good QnA system
# * Inference is very slow on mediocre hardware
# * Inference results are also bad
#
# ### Identified issues
# * too many paragraphs --> the initial choice of the document in which to
#   look for the answer is too hard with tf-idf
# * too many paragraphs with too little text, bad for tf-idf and for
#   predicting the answer
# * hardware is too slow
question = configs['question']

# One relative location per artifact kind, used both for downloading and
# for loading, so files are read from where they were written.
data_directory = './data/'
models_directory = './models/'

# download_squad(dir=data_directory)
download_bnpp_data(dir=data_directory)
# download_model('distilbert-squad_1.1', dir=models_directory)
download_model('bert-squad_1.1', dir=models_directory)

# NOTE(review): the two file names below assume what the cdQA download
# helpers store in the target directories -- confirm against the installed
# cdQA version.
df = pandas.read_csv(
    data_directory + 'bnpp_newsroom-v1.1.csv',
    converters={'paragraphs': ast.literal_eval},
)
df = filter_paragraphs(df)

cdqa_pipeline = QAPipeline(reader=models_directory + 'bert_qa.joblib')
cdqa_pipeline.fit_retriever(df)
# cdqa_pipeline.fit_reader('path to squad like dataset . json')

# A single prediction is requested because the fields are read below as
# prediction[0..2]; pass n_predictions=<k> to get a list of predictions.
prediction = cdqa_pipeline.predict(question)
# cdqa_pipeline.dump_reader('path to save . joblib')  # save reader model

# Plain strings (no trailing commas, which would build 1-tuples).
query = 'query: {}\n'.format(question)
answer = 'answer: {}\n'.format(prediction[0])
title = 'title: {}\n'.format(prediction[1])
paragraph = 'paragraph: {}\n'.format(prediction[2])
# The notification body is passed as one string.
result = ''.join((query, answer, title, paragraph))

notify2.init('question answer')
notif = notify2.Notification('qa', result)
# notif.set_urgency(notify2.URGENCY_CRITICAL)
# Configure the timeout before the notification is shown.
notif.set_timeout(10)
notif.show()