def on_click(self):
    """Indexes FAQ values for the Search Engine"""
    db_name = self.db_input.get().strip()
    faq_table_name = self.table_input.get().strip()
    if db_name and faq_table_name:
        data_storage = Database(f"{db_name}.db")
        faq_df = data_storage.get_dataframe(table=faq_table_name)
        faq_se = FAQSearchEngine()
        faq_se.create_index(
            corpus=faq_df,
            db=data_storage,
            table_name=f"{faq_table_name}_doc_term_matrix",
        )
        data_storage.close_connection()
        self.close_win()
    else:
        print("Error: fields must not be empty.")
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""This is the script responsible for fetching the Rucio documentation through GitHub"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")
    required.add_argument(
        "-t",
        "--token",
        help="GitHub API token to be used for the GET requests while fetching",
        required=True,
    )
    optional.add_argument(
        "-o",
        "--output_db",
        default="docs_input_data",
        help="Output .db file where the data is stored (default is docs_input_data)",
    )
    optional.add_argument(
        "--documentation_table",
        default="docs",
        help="Name of the table where we will store the documentation (default is docs)",
    )
    args = parser.parse_args()
    db_name = args.output_db
    token = args.token
    docs_table = args.documentation_table

    # DocsFetcher
    data_storage = Database(f"{db_name}.db")
    fetcher = FetcherFactory.get_fetcher("Rucio Documentation")
    docs_df = fetcher.fetch(api_token=token)
    fetcher.save(db=data_storage, docs_table_name=docs_table)
    print(f"Data saved on {db_name}.db")
    print("Sample docs:")
    print(docs_df.head())
    data_storage.close_connection()
def fetch_faq_data():
    """Creates FAQ table and populates it with data in faq.json"""
    # create faq table
    print("Creating faq table in data_storage.db")
    data_storage = Database("data_storage.db")
    data_storage.create_faq_table()
    # load faq data
    with open(DATA_DIR + "faq.json") as json_file:
        data = json.load(json_file)
    # insert data to db
    print("Inserting data from faq.json file...")
    for faq in data:
        data_storage.insert_faq(faq)
    data_storage.close_connection()
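# For reference, a hypothetical sketch of what the entries loaded from
# faq.json presumably look like. The field names mirror the
# FAQ(question=..., answer=..., author=..., keywords=...) constructor used
# in insert_faq_to_db below; the real file's schema may differ.
example_faq_data = [
    {
        "question": "How do I ask Donkeybot a question?",
        "answer": "Run the ask_donkeybot script and type your question.",
        "author": "donkeybot-dev",
        "keywords": "ask, question",
    }
]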
def insert_faq_to_db(
    self, db_name, faq_table_name, question, answer, author, keywords
):
    """Inserts FAQ values to db"""
    # prepare data_storage
    data_storage = Database(f"{db_name}.db")
    # create table if it doesn't exist
    tables_in_db = [table[0] for table in data_storage.get_tables()]
    if faq_table_name not in tables_in_db:
        print(f"Creating '{faq_table_name}' table in {db_name}.db")
        data_storage.create_faq_table(table_name=faq_table_name)
    # insert row
    faq_obj = FAQ(
        question=question, answer=answer, author=author, keywords=keywords
    )
    data_storage.insert_faq(faq_obj, table_name=faq_table_name)
    print(f"FAQ object inserted in '{faq_table_name}' table on {db_name}.db!")
    data_storage.close_connection()
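# A minimal usage sketch for insert_faq_to_db, assuming `gui` is an instance
# of the (hypothetical) FAQ-creation window class that defines the method;
# it creates the target table on first use and inserts a single row.
gui.insert_faq_to_db(
    db_name="data_storage",
    faq_table_name="faq",
    question="Where is the Rucio documentation stored?",
    answer="In the 'docs' table of data_storage.db by default.",
    author="donkeybot-dev",
    keywords="docs, storage",
)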
def __init__(self, model=None, db_name="data_storage", num_answers_to_predict=3):
    self.model = "distilbert-base-cased-distilled-squad"
    if model:
        check_model_availability(model)
        self.model = model
    self.db_name = db_name
    # use the GPU (device 0) if available, otherwise fall back to CPU (-1)
    gpu = 0 if torch.cuda.is_available() else -1
    self.answer_detector = AnswerDetector(
        model=self.model, device=gpu, num_answers_to_predict=num_answers_to_predict
    )
    data_storage = Database(f"{self.db_name}.db")
    faq_se, docs_se, question_se = setup_search_engines(db=data_storage)
    self.qa_interface = QAInterface(
        detector=self.answer_detector,
        question_engine=question_se,
        faq_engine=faq_se,
        docs_engine=docs_se,
    )
    # the thread that inits the Donkeybot instance won't use the db again
    data_storage.close_connection()
def _store_answers(self, answers):
    # a different thread runs each time, so a new connection to the db is needed;
    # we could use sqlite3.connect('your.db', check_same_thread=False)
    # but then we'd have to handle synchronization ourselves
    data_storage = Database(f"{self.db_name}.db")
    for answer in answers:
        data_storage.insert_answer(answer)
    data_storage.close_connection()
    return
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""This is the script responsible for parsing GitHub issue comments"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")
    required.add_argument(
        "-i",
        "--input_db",
        help="Input .db file name of the raw fetched issue comments",
        required=True,
    )
    optional.add_argument(
        "-o",
        "--output_db",
        default="data_storage",
        help="Output .db file name of the parsed issue comments (default is data_storage)",
    )
    optional.add_argument(
        "--issue_comments_table",
        default="issue_comments",
        help="Name of the table where we will store the parsed issue comments and of the table where the raw issue comments are stored (default is issue_comments)",
    )
    args = parser.parse_args()
    input_db = args.input_db
    output_db = args.output_db
    issue_comments_table = args.issue_comments_table

    # input
    raw_issue_comments_data = Database(f"{input_db}.db").get_dataframe(
        issue_comments_table
    )
    # output
    data_storage = Database(f"{output_db}.db")
    data_storage.create_issue_comments_table(table_name=issue_comments_table)

    # IssueCommentsParser
    print("Let's create an IssueCommentsParser.")
    comment_parser = ParserFactory.get_parser("Issue Comment")
    comment_parser.parse_dataframe(
        raw_issue_comments_data,
        db=data_storage,
        issue_comments_table=issue_comments_table,
    )
    print(f"Data from {input_db}.db parsed and saved on {output_db}.db")
    data_storage.close_connection()
def __init__(self, model=None, db_name="data_storage", num_answers_inf=1):
    self.model = "distilbert-base-cased-distilled-squad"
    if model:
        check_model_availability(model)
        self.model = model
    gpu = 0 if torch.cuda.is_available() else -1
    self.answer_detector = AnswerDetector(
        model=self.model, device=gpu, num_answers_to_predict=num_answers_inf
    )
    data_storage = Database(f"{db_name}.db")
    faq_se, docs_se, question_se = setup_search_engines(db=data_storage)
    self.qa_interface = QAInterface(
        detector=self.answer_detector,
        question_engine=question_se,
        faq_engine=faq_se,
        docs_engine=docs_se,
    )
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""This is the script responsible for parsing the emails"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")
    required.add_argument(
        "-i",
        "--input_db",
        help="Input .db file name of the raw emails",
        required=True,
    )
    optional.add_argument(
        "-o",
        "--output_db",
        default="data_storage",
        help="Output .db file name of the parsed emails (default is data_storage)",
    )
    optional.add_argument(
        "--emails_table",
        default="emails",
        help="Name of the table where we will store the parsed emails and of the table where the raw emails are stored (default is emails)",
    )
    args = parser.parse_args()
    input_db = args.input_db
    output_db = args.output_db
    emails_table = args.emails_table

    # input
    raw_emails_data = Database(f"{input_db}.db").get_dataframe(emails_table)
    # output
    data_storage = Database(f"{output_db}.db")
    data_storage.create_emails_table(table_name=emails_table)

    # EmailParser
    print("Let's create an EmailParser")
    email_parser = ParserFactory.get_parser("Email")
    email_parser.parse_dataframe(
        raw_emails_data, db=data_storage, emails_table_name=emails_table
    )
    print(f"Data from {input_db}.db parsed and saved on {output_db}.db")
    data_storage.close_connection()
@pytest.fixture
def test_db():
    db = Database("test.db", "test_table")
    db.create_issue_comments_table("test_table")
    yield db
    db.close_connection()
    os.remove(config.DATA_DIR + "test.db")
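# A minimal sketch of a test that consumes the fixture above, assuming
# pytest wiring; it only checks that the fixture created 'test_table'.
def test_issue_comments_table_created(test_db):
    tables = [table[0] for table in test_db.get_tables()]
    assert "test_table" in tables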
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""With this script you can query with the Search Engine module and get top-k results."""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")
    required.add_argument(
        "-q",
        "--query",
        help="What you want to query.",
        required=True,
    )
    required.add_argument(
        "-k",
        "--top_k",
        type=check_positive,
        help="Number of documents that'll be retrieved.",
        required=True,
    )
    optional.add_argument(
        "-mq",
        "--match_questions",
        type=str2bool,
        nargs="?",  # 0 or 1 argument
        const=True,
        default=False,
        help="Match query to similar questions.",
    )
    optional.add_argument(
        "-md",
        "--match_docs",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Match query to similar documents.",
    )
    optional.add_argument(
        "--docs_index",
        default="rucio_doc_term_matrix",
        help="Name of the documentation index table. (default is rucio_doc_term_matrix)",
    )
    optional.add_argument(
        "--docs_original_table",
        default="docs",
        help="Name of the original table for the documentation. (default is docs)",
    )
    optional.add_argument(
        "--question_index",
        default="questions_doc_term_matrix",
        help="Name of the questions index table. (default is questions_doc_term_matrix)",
    )
    optional.add_argument(
        "--questions_original_table",
        default="questions",
        help="Name of the original table for the questions. (default is questions)",
    )
    optional.add_argument(
        "-db",
        "--db_name",
        default="data_storage",
        help="Name of the database where indexes are stored. (default is data_storage)",
    )
    args = parser.parse_args()
    db_name = args.db_name
    query = args.query
    top_k = int(args.top_k)
    if not (args.match_questions or args.match_docs):
        parser.error(
            "No index to search requested, add -mq/--match_questions or -md/--match_docs"
        )
    match_questions = args.match_questions
    match_docs = args.match_docs
    docs_idx_name = args.docs_index
    docs_original_table = args.docs_original_table
    question_original_table = args.questions_original_table
    question_idx_name = args.question_index

    data_storage = Database(f"{db_name}.db")
    # load the search engines
    try:
        docs_se = SearchEngine()
        docs_se.load_index(
            db=data_storage,
            table_name=docs_idx_name,
            original_table=docs_original_table,
        )
        q_se = QuestionSearchEngine()
        q_se.load_index(
            db=data_storage,
            table_name=question_idx_name,
            original_table=question_original_table,
        )
        data_storage.close_connection()
        if match_docs:
            docs_results = docs_se.search(query, top_k)
            print(f"\nTop-{top_k} retrieved documentation:")
            print(docs_results[["doc_id", "question", "name", "context"]])
        if match_questions:
            question_results = q_se.search(query, top_k)
            print(f"\nTop-{top_k} retrieved past questions:")
            print(question_results[["question_id", "query", "question", "context"]])
    except Exception as _e:
        print("Error: ", end="")
        sys.exit(_e)
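# The check_positive validator passed as `type=` above is not shown in this
# section; a typical argparse implementation, offered here as an assumption,
# looks like the following.
def check_positive(value):
    ivalue = int(value)
    if ivalue <= 0:
        raise argparse.ArgumentTypeError(f"{value} is not a positive integer")
    return ivalue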
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""Use this script to ask DonkeyBot!"""
    )
    optional = parser.add_argument_group("optional arguments")
    optional.add_argument(
        "-m",
        "--model",
        default="distilbert-base-cased-distilled-squad",
        help="BERT/DistilBERT model used to infer answers. (default is distilbert-base-cased-distilled-squad)",
    )
    optional.add_argument(
        "-db",
        "--db_name",
        default="data_storage",
        help="Name of the database where all data is stored. (default is data_storage)",
    )
    optional.add_argument(
        "-s",
        "--store_answers",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Store the answers on the '--answers_table' table. (default is False)",
    )
    optional.add_argument(
        "-n",
        "--num_answers_predicted_per_document",
        default=3,
        help="Number of answers predicted per document. (default is 3)",
    )
    optional.add_argument(
        "--answers_table",
        default="answers",
        help="Name of the answers table. (default is 'answers')",
    )
    args = parser.parse_args()
    db_name = args.db_name
    model = args.model
    answers_table = args.answers_table
    store_answers = args.store_answers
    num_answers_inf = int(args.num_answers_predicted_per_document)

    check_model_availability(model)
    # prepare data_storage
    data_storage = Database(f"{db_name}.db")
    # create the answers table if it doesn't exist
    tables_in_db = [table[0] for table in data_storage.get_tables()]
    if answers_table not in tables_in_db:
        print(f"Creating '{answers_table}' table in {db_name}.db")
        data_storage.create_answers_table(table_name=answers_table)
    # load answer detector
    print("Loading AnswerDetector...")
    gpu = 0 if torch.cuda.is_available() else -1
    answer_detector = AnswerDetector(
        model=model, device=gpu, num_answers_to_predict=num_answers_inf
    )
    # load search engines
    faq_se, docs_se, question_se = setup_search_engines(db=data_storage)
    # load interface
    qa_interface = QAInterface(
        detector=answer_detector,
        question_engine=question_se,
        faq_engine=faq_se,
        docs_engine=docs_se,
    )

    # Main loop
    print("DonkeyBot ready to be asked!")
    try:
        while True:
            print("\nCTRL+C to exit donkeybot")
            query = str(input("ask question: "))
            top_k = int(input("how many answers: "))
            start_time = time.time()
            answers = qa_interface.get_answers(query, top_k=top_k)
            print(
                f"Total inference time: {round(time.time() - start_time, 2)} seconds"
            )
            print_answers(answers)
            if store_answers:
                for answer in answers:
                    data_storage.insert_answer(answer, table_name=answers_table)
    except KeyboardInterrupt:
        data_storage.close_connection()
        sys.exit("\nExiting...")
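# The str2bool helper used for the -s/--store_answers and -mq/-md flags is
# also not shown in this section; a common argparse idiom, offered here as
# an assumption, looks like the following.
def str2bool(value):
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if value.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")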
@pytest.fixture
def test_db():
    db = Database("db_for_tests.db")
    yield db
    db.close_connection()
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""This is the script responsible for fetching GitHub issues and comments"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")
    required.add_argument(
        "-r",
        "--repo",
        help="Name of the GitHub repository we are fetching from. Format `user/repo`",
        required=True,
    )
    required.add_argument(
        "-t",
        "--token",
        help="GitHub API token to be used for the GET requests while fetching",
        required=True,
    )
    optional.add_argument(
        "-o",
        "--output_db",
        default="issues_input_data",
        help="Output .db file where the data is stored (default is issues_input_data)",
    )
    optional.add_argument(
        "--max_pages",
        default=201,
        type=int,
        help="Maximum number of pages we will request through GitHub's API (default is 201)",
    )
    optional.add_argument(
        "--issues_table",
        default="issues",
        help="Name of the table where we will store the issues (default is issues)",
    )
    optional.add_argument(
        "--comments_table",
        default="issue_comments",
        help="Name of the table where we will store the comments (default is issue_comments)",
    )
    args = parser.parse_args()
    db_name = args.output_db
    repository = args.repo
    token = args.token
    issues_table = args.issues_table
    comments_table = args.comments_table
    max_pages = args.max_pages

    # IssueFetcher
    data_storage = Database(f"{db_name}.db")
    fetcher = FetcherFactory.get_fetcher("Issue")
    (issues_df, comments_df) = fetcher.fetch(
        repo=repository, api_token=token, max_pages=max_pages
    )
    fetcher.save(
        db=data_storage,
        issues_table_name=issues_table,
        comments_table_name=comments_table,
    )
    print(f"Raw issues data stored on {db_name}.db")
    print("Sample:")
    print("Issues")
    print(issues_df.head())
    print("Comments")
    print(comments_df.head())
    data_storage.close_connection()
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""Run this script to detect and save questions originating from GitHub issues"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")
    required.add_argument(
        "-db", "--db_name", help="Database name of our storage", required=True
    )
    optional.add_argument(
        "--issues_table",
        default="issues",
        help="Name given to the table holding the issues. (default is issues)",
    )
    optional.add_argument(
        "--questions_table",
        default="questions",
        help="Name given to the table holding the questions. (default is questions)",
    )
    args = parser.parse_args()
    db_name = args.db_name
    issues_table = args.issues_table
    questions_table = args.questions_table

    data_storage = Database(f"{db_name}.db")
    tables_in_db = [table[0] for table in data_storage.get_tables()]
    assert issues_table in tables_in_db
    if questions_table not in tables_in_db:
        print(f"Creating '{questions_table}' table in {db_name}.db")
        data_storage.create_question_table(table_name=questions_table)

    issues_df = data_storage.get_dataframe(issues_table)
    qd = QuestionDetector("issue")
    print("Detecting questions in issues that have comments...")
    issues_with_questions = 0
    total_questions = 0
    for i in tqdm(range(len(issues_df.index))):
        text = str(issues_df.clean_body.values[i])
        issue_id = int(issues_df.issue_id.values[i])
        questions_detected = qd.detect(text)
        if not questions_detected:
            continue
        issues_with_questions += 1
        for question in questions_detected:
            total_questions += 1
            question.set_origin_id(issue_id)
            # make sure to find the context for each question;
            # skip questions for which no context was found
            question.find_context_from_table(data_storage, table_name=issues_table)
            if question.context == "":
                continue
            data_storage.insert_question(question, table_name=questions_table)

    # only reference the loop variable if at least one question was detected
    if total_questions:
        print(f"Type of the question objects: {type(question)}")
    print(f"Total questions detected: {total_questions}")
    print(f"Number of issues with questions: {issues_with_questions}")
    data_storage.close_connection()
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""Run this script to detect and save questions originating from emails"""
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")
    required.add_argument(
        "-db", "--db_name", help="Database name of our storage", required=True
    )
    optional.add_argument(
        "--emails_table",
        default="emails",
        help="Name given to the table holding the emails. (default is emails)",
    )
    optional.add_argument(
        "--questions_table",
        default="questions",
        help="Name given to the table holding the questions. (default is questions)",
    )
    args = parser.parse_args()
    db_name = args.db_name
    emails_table = args.emails_table
    questions_table = args.questions_table

    data_storage = Database(f"{db_name}.db")
    tables_in_db = [table[0] for table in data_storage.get_tables()]
    assert emails_table in tables_in_db
    if questions_table not in tables_in_db:
        print(f"Creating '{questions_table}' table in {db_name}.db")
        data_storage.create_question_table(table_name=questions_table)

    emails_df = data_storage.get_dataframe(emails_table)
    # only keep emails that are part of a conversation to search for questions in them
    conv_df = (
        emails_df[emails_df["conversation_id"].notnull()]
        .sort_values(by=["conversation_id", "email_date"])
        .reset_index(drop=True)
    )
    qd = QuestionDetector("email")
    print("Detecting questions in emails that are part of conversations...")
    emails_with_questions = 0
    total_questions = 0
    for i in tqdm(range(len(conv_df.index))):
        text = str(conv_df.clean_body.values[i])
        email_id = int(conv_df.email_id.values[i])
        questions_detected = qd.detect(text)
        if not questions_detected:
            continue
        emails_with_questions += 1
        for question in questions_detected:
            total_questions += 1
            question.set_origin_id(email_id)
            # make sure to find the context for each question;
            # skip questions for which no context was found
            question.find_context_from_table(data_storage, table_name=emails_table)
            if question.context == "":
                continue
            data_storage.insert_question(question, table_name=questions_table)

    # only reference the loop variable if at least one question was detected
    if total_questions:
        print(f"Type of the question objects: {type(question)}")
    print(f"Total questions detected: {total_questions}")
    print(f"Number of emails with questions: {emails_with_questions}")
    data_storage.close_connection()
def main():
    # Parse cli arguments
    parser = argparse.ArgumentParser(
        description="""This script indexes our data for the SearchEngine and QuestionSearchEngine."""
    )
    optional = parser.add_argument_group("optional arguments")
    optional.add_argument(
        "-db",
        "--db_name",
        default="data_storage",
        help="Database name of our storage. (default is data_storage)",
    )
    optional.add_argument(
        "--documentation_table",
        default="docs",
        help="Name of the table where the documentation is stored (default is docs)",
    )
    optional.add_argument(
        "--questions_table",
        default="questions",
        help="Name given to the table holding the questions. (default is questions)",
    )
    optional.add_argument(
        "--faq_table",
        default="faq",
        help="Name given to the table holding the FAQ. (default is faq)",
    )
    args = parser.parse_args()
    db_name = args.db_name
    docs_table = args.documentation_table
    questions_table = args.questions_table
    faq_table = args.faq_table

    data_storage = Database(f"{db_name}.db")

    # Documentation SearchEngine
    docs_se = SearchEngine()
    docs_df = data_storage.get_dataframe(docs_table)
    # let's not index the release notes in this version of the bot;
    # the same filtering also exists in load_index() for Rucio documents
    docs_df = docs_df[docs_df["doc_type"] != "release_notes"]
    print("Indexing Rucio documentation for the SearchEngine...")
    docs_se.create_index(
        corpus=docs_df, db=data_storage, table_name="rucio_doc_term_matrix"
    )

    # QuestionSearchEngine
    questions_se = QuestionSearchEngine()
    questions_df = data_storage.get_dataframe(questions_table)
    print("Indexing Questions for the QuestionSearchEngine...")
    questions_se.create_index(
        corpus=questions_df,
        db=data_storage,
        table_name=f"{questions_table}_doc_term_matrix",
    )

    # FAQSearchEngine
    faq_se = FAQSearchEngine()
    faq_df = data_storage.get_dataframe(faq_table)
    print("Indexing FAQ for the FAQSearchEngine...")
    faq_se.create_index(
        corpus=faq_df, db=data_storage, table_name=f"{faq_table}_doc_term_matrix"
    )
    data_storage.close_connection()
def update_label(self, answer_id, label):
    """Updates the label of a stored answer; only 0 and 1 are valid labels."""
    # validate the label before opening a connection
    assert label in (0, 1)
    data_storage = Database(f"{self.db_name}.db")
    data_storage.update_label(answer_id, label)
    data_storage.close_connection()
    return
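# A minimal usage sketch for update_label, assuming `bot` is an instance of
# the class defining it and that answer_id values come from the 'answers'
# table; the id value here is a placeholder, and the label semantics
# (1 for a correct answer, 0 for an incorrect one) are an assumption.
bot.update_label(answer_id=42, label=1)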