def __init__(self):
    """Set up the query-processor logger and the configured database handle."""
    LOG_FILE = "logs/queryProcessor.log"
    self.__logger = logging.getLogger("queryProcessor")
    # logging.getLogger returns a shared named instance, so constructing this
    # object more than once used to stack duplicate FileHandlers and emit
    # every log line multiple times. Only attach a handler the first time.
    if not self.__logger.handlers:
        file_handler = logging.FileHandler(LOG_FILE)
        self.__logger.addHandler(file_handler)
    self.__config = ConfigurationParser()
    # Backend selection ('database' key) comes from the engine config section.
    self.__database = DatabaseFactory().getDatabase(
        self.__config.getEngineConfig("SmartPDFAssistant")['database'])
def __init__(self):
    """Configure logging and server parameters, then connect to the database.

    Raises whatever __connect raises (ConnectionError) when the database
    server is unreachable.
    """
    # NOTE(review): the original assigned a local ``__metaclass__ = Singleton``
    # here with the comment "This makes class singelton". A local variable
    # inside __init__ has no metaclass effect (and Python 3 ignores
    # ``__metaclass__`` entirely), so the misleading no-op was removed.
    # Declare the metaclass on the class statement if singleton behavior is
    # actually required.
    # Instating Logger
    LOG_FILE = "logs/database.log"
    self.__logger = logging.getLogger("mongo")
    # Shared named logger: guard against stacking duplicate FileHandlers
    # when this initializer runs more than once.
    if not self.__logger.handlers:
        file_handler = logging.FileHandler(LOG_FILE)
        self.__logger.addHandler(file_handler)
    self.__config = ConfigurationParser()
    # Database server configurations
    self.__port = int(self.__config.getDatabaseConfig()['port'])
    self.__host = self.__config.getDatabaseConfig()['host']
    # NOTE(review): this value is passed to MongoClient as
    # serverSelectionTimeoutMS, which is in *milliseconds*; 10 ms is an
    # extremely tight window -- confirm whether seconds were intended.
    self.__maxDelay = 10
    self.__connect()
class PDFProcessor:
    """Pipeline that turns an uploaded PDF into searchable chunks.

    Steps: extract page text with pdfminer, split it into
    [title, paragraph, keywords] chunks using the table of contents,
    then persist the chunks to MongoDB and to Elasticsearch.
    """

    def __init__(self, pdfname):
        # Filename of the uploaded PDF (resolved under uploads/ later).
        self.__pdfname = pdfname

    def processPdf(self, pdfname):
        """Run the full extract/chunk/store pipeline for uploads/<pdfname>."""
        pdf_path = 'uploads/' + pdfname  # input document filename
        text_path = self.__extractText(pdf_path)
        processed_pdf = self.__createChunks(text_path)
        self.__addToDatabase(processed_pdf)
        self.__addToElasticServer(processed_pdf)

    def __extractText(self, pdf_path):
        """Write the PDF's text to uploads/<name>-text.txt; return that path."""
        text_path = 'uploads/' + pdf_path.rsplit('/', 1)[-1][:-4] + '-text.txt'
        # ``with`` replaces the explicit open/close pair so the output file is
        # closed even if page extraction raises part-way through.
        with open(text_path, "w") as out_ptr:
            for page in self.__extractTextByPage(pdf_path):
                out_ptr.write(page)
        return text_path

    def __extractTextByPage(self, pdf_path):
        """Yield the ASCII-filtered text of each PDF page in order."""
        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True,
                                          check_extractable=True):
                resource_manager = PDFResourceManager()
                fake_file_handle = StringIO()
                converter = TextConverter(resource_manager, fake_file_handle)
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      converter)
                page_interpreter.process_page(page)
                # Drop non-ASCII characters that would confuse the regex
                # steps downstream.
                text = fake_file_handle.getvalue().encode('ascii',
                                                          'ignore').decode()
                yield text
                # close open handles
                converter.close()
                fake_file_handle.close()

    def __createChunks(self, text_path):
        """Build [title, paragraph, keywords] chunks from the text dump."""
        # The original opened this file and never closed it (handle leak).
        with open(text_path) as text_file_ptr:
            text = text_file_ptr.read()
        chunks = []
        self.__generateTitles(chunks, text)
        self.__generatePara(chunks, text)
        self.__generateKeywords(chunks)
        return chunks

    def __generateTitles(self, chunks, text):
        """Seed one [title] list per table-of-contents entry found in *text*."""
        # Raw string avoids deprecated invalid-escape-sequence warnings; the
        # pattern itself is unchanged. Matches "Some Title ..... 12" TOC rows.
        table_of_content = re.findall(
            r"(?!\s)[\w\s&\-\,\_\?\&\'\(\)\:\/]+\.\.+\s\d+", text)
        for title in table_of_content:
            chunks.append([title.split(".")[0]])

    def __generatePara(self, chunks, text):
        """Append to each chunk the paragraph between its title and the next."""
        chunk_len = len(chunks)
        if chunk_len == 0:
            # No TOC titles were found; the original crashed here with an
            # IndexError on chunks[-1].
            return
        # Split on the last title so content[1] holds the body after the TOC.
        content = text.split(chunks[chunk_len - 1][0], 1)
        for i in range(0, chunk_len):
            temp = chunks[i]
            # Initialize so a failed regex search appends "" instead of
            # raising NameError on an unbound local (original defect).
            paragraph = ''
            if i != chunk_len - 1:
                para = re.search(
                    re.escape(chunks[i][0]) + r"[\w\s\W]+" +
                    re.escape(chunks[i + 1][0]),
                    content[1],
                    flags=re.IGNORECASE)
                if para:
                    para = "{}".format(para.group(0))
                    paragraph = para.replace(chunks[i][0], '', 1)
                    # Strip the trailing next-section title. The original did
                    # ``paragraph += para.replace(...)`` which appended the
                    # whole paragraph a second time instead of removing it.
                    paragraph = paragraph.replace(chunks[i + 1][0], '')
            else:
                para = re.search(re.escape(chunks[i][0]) + r"[\w\s\W]+",
                                 content[1],
                                 flags=re.IGNORECASE)
                if para:
                    para = "{}".format(para.group(0))
                    paragraph = para.replace(chunks[i][0], '')
            temp.append("".join(paragraph))

    def __generateKeywords(self, chunks):
        """Append RAKE ranked (score, phrase) pairs to every chunk."""
        r = Rake()
        for chunk in chunks:
            text = chunk[1]
            # extract_keywords_from_text mutates ``r`` in place; the ranked
            # phrases are then read back via the getter.
            r.extract_keywords_from_text(text)
            chunk.append(r.get_ranked_phrases_with_scores())

    def __addToDatabase(self, chunks):
        """Insert one ProcessedPDF row per chunk, stamped with the datetime."""
        self.__config = ConfigurationParser()
        self.__database = DatabaseFactory().getDatabase(
            self.__config.getEngineConfig("SmartPDFAssistant")['database'])
        for chunk in chunks:
            self.__database.insertInto(
                "PDFAssistant", "ProcessedPDF", {
                    'Date': datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                    'Title': chunk[0],
                    'Text': chunk[1],
                    'Keywords': chunk[2]
                })

    def __addToElasticServer(self, chunks):
        """Index the chunks into the "esindex" Elasticsearch index."""
        es = ElasticServer()
        # Removed unused local ``pdfName`` (computed but never referenced).
        print("es.index : ", es.createIndex("esindex"))
        es.store_records("esindex", chunks)
def __addToDatabase(self, chunks):
    """Persist each processed chunk as one row of the ProcessedPDF collection."""
    self.__config = ConfigurationParser()
    engine_cfg = self.__config.getEngineConfig("SmartPDFAssistant")
    self.__database = DatabaseFactory().getDatabase(engine_cfg['database'])
    for chunk in chunks:
        row = {
            'Date': datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
            'Title': chunk[0],
            'Text': chunk[1],
            'Keywords': chunk[2],
        }
        self.__database.insertInto("PDFAssistant", "ProcessedPDF", row)
class QueryProcessor(Engine):
    """Engine that answers free-text queries over processed PDFs.

    Retrieval runs in two stages: Elasticsearch narrows candidate titles,
    then Word Mover's Distance (WMD) ranks them against the query; the
    paragraph of the best-ranked title is returned from the database.
    """

    def __init__(self):
        LOG_FILE = "logs/queryProcessor.log"
        self.__logger = logging.getLogger("queryProcessor")
        # Shared named logger: attach the FileHandler only once so repeated
        # instantiation does not duplicate every log line (original defect).
        if not self.__logger.handlers:
            file_handler = logging.FileHandler(LOG_FILE)
            self.__logger.addHandler(file_handler)
        self.__config = ConfigurationParser()
        self.__database = DatabaseFactory().getDatabase(
            self.__config.getEngineConfig("SmartPDFAssistant")['database'])

    def searchInElasticServer(self, query):
        """Return the titles matching *query* from the "esindex" index."""
        es = ElasticServer()
        selected_titles = es.get_shard("esindex", query)
        return selected_titles

    def train(self, pdfname):
        """Process the uploaded PDF so its content becomes queryable."""
        print("\n pdfname : " + pdfname)
        processed = PDFProcessor(pdfname)
        processed.processPdf(pdfname)

    def merge(self, list1, list2):
        """Zip the two lists element-wise into a tuple of pairs."""
        merged_list = tuple(zip(list1, list2))
        return merged_list

    # Will take the query and return the output
    def predict(self, query, wmdmodel):
        """Answer *query* with the best-matching stored paragraph.

        Returns the answer text, or a "No results found" message when
        nothing matches. Both query and answer are logged and the pair is
        recorded in the QueryHistory collection.
        """
        # Logging the query
        self.__logger.info("[{}] : Received Query : {}".format(
            datetime.now().strftime("%d/%m/%Y %H:%M:%S"), query))

        # Applying RAKE on query. extract_keywords_from_text mutates the
        # Rake instance and returns None -- the original stored that None
        # in an unused variable.
        r = Rake()
        r.extract_keywords_from_text(query)
        query_ranked_phrase = r.get_ranked_phrases()
        query = ''.join(query_ranked_phrase)

        wmd = WMD()
        # Search through self; the original constructed a *second*
        # QueryProcessor here, needlessly re-running __init__ (logger,
        # config, database) just to call this one method.
        selected_titles = self.searchInElasticServer(query)
        selected_titles_len = len(selected_titles)
        if not selected_titles:
            return "No results found!"

        # Using WMD to get vector distance between query and each of the
        # selected titles
        wmdresponse = wmd.getWMDResponse(wmdmodel, query, selected_titles,
                                         selected_titles_len)
        response = self.merge(selected_titles, wmdresponse)
        # Sorting response according to vector distance (ascending)
        response = sorted(response, key=itemgetter(1))

        # Retrieving preprocessed data from DB
        db_data = self.__database.getFrom("PDFAssistant", "ProcessedPDF", '')
        titles = db_data['Title']
        if len(titles) == 0:
            # Guard: the original loop left its index unbound on an empty
            # table and crashed with a NameError.
            return "No results found! Please refine your search"

        # Selecting title having minimum vector distance with input query.
        # Falling back to the last row mirrors the original loop, whose
        # index simply stopped at the final element when nothing matched.
        best = len(titles) - 1
        for i in range(len(titles)):
            if response[0][0] == titles[i]:
                best = i
                break
        finalresponse = db_data['Text'][best]
        if not finalresponse:
            return "No results found! Please refine your search"

        # Logging the response
        self.__logger.info("[{}] : Answer Sent : {}".format(
            datetime.now().strftime("%d/%m/%Y %H:%M:%S"), finalresponse))
        self.__logger.info("--" * 30)

        # Insertion into DB
        self.__database.insertInto(
            "PDFAssistant", "QueryHistory", {
                'Date': datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                'Query': query,
                'Answer': finalresponse
            })
        return finalresponse
import os
import logging
from datetime import datetime
from flask import Flask, request, render_template, url_for, redirect, jsonify
from Engines.QueryProcessor import QueryProcessor
from ConfigParser.ConfigParser import ConfigurationParser
from Factory.DatabaseFactory import DatabaseFactory
from Processor.WMD import WMD

# Flask app serving both static assets and templates out of ./templates.
app = Flask(__name__,
            static_url_path='',
            static_folder='templates',
            template_folder='templates')

# Process-wide configuration and database handle, created once at import.
config = ConfigurationParser()
db = DatabaseFactory().getDatabase(
    config.getEngineConfig("SmartPDFAssistant")['database'])
portNumber = int(config.getServerConfig()['port'])

# Single QueryProcessor instance shared by all requests.
queryProcessor = QueryProcessor()
wmd = WMD()
# Loading Glove vectors (heavy one-time load; the model is reused per query)
wmdmodel = wmd.load()

# Server-level logger writing to logs/server.log.
# NOTE(review): this runs at import time, before the app is actually
# listening -- the "Successfully hosted" message is logged optimistically.
LOG_FILE = "logs/server.log"
logger = logging.getLogger("server")
file_handler = logging.FileHandler(LOG_FILE)
logger.addHandler(file_handler)
logger.info("[{}] : Successfully hosted server at {}".format(
    datetime.now().strftime("%d/%m/%Y %H:%M:%S"), str(portNumber)))


# Route for home page
@app.route('/')
class Mongo(Database):
    """MongoDB implementation of the project's Database interface.

    Host and port come from the configuration file; every operation is
    logged to logs/database.log via the "mongo" logger.
    """

    def __init__(self):
        # NOTE(review): the original assigned a local ``__metaclass__ =
        # Singleton`` here ("This makes class singelton"); a local inside
        # __init__ has no metaclass effect and Python 3 ignores
        # ``__metaclass__`` entirely, so the no-op was removed. Declare the
        # metaclass on the class statement if singleton behavior is needed.
        # Instating Logger
        LOG_FILE = "logs/database.log"
        self.__logger = logging.getLogger("mongo")
        # Shared named logger: attach the handler only once so repeated
        # instantiation does not duplicate every log line.
        if not self.__logger.handlers:
            file_handler = logging.FileHandler(LOG_FILE)
            self.__logger.addHandler(file_handler)
        self.__config = ConfigurationParser()
        # Database server configurations
        self.__port = int(self.__config.getDatabaseConfig()['port'])
        self.__host = self.__config.getDatabaseConfig()['host']
        # NOTE(review): used below as serverSelectionTimeoutMS, which is in
        # *milliseconds* -- a 10 ms window is almost certainly too short;
        # confirm whether seconds were intended.
        self.__maxDelay = 10
        self.__connect()

    def __connect(self):
        """Open the MongoClient and verify the server is reachable.

        Raises:
            ConnectionError: when the server cannot be contacted.
        """
        try:
            self.__client = MongoClient(
                self.__host,
                self.__port,
                serverSelectionTimeoutMS=self.__maxDelay)
            # server_info() forces a round-trip so a dead server fails here
            # rather than on the first real query.
            self.__client.server_info()
            self.__logger.info(
                "[{}] : Successfully connected to database server at {}".
                format(datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                       self.__host + ":" + str(self.__port)))
            self.__logger.info("==" * 30)
        except Exception as err:
            # Was a bare ``except:`` that logged through the *root* logger;
            # narrowed to Exception and routed to the instance logger.
            self.__logger.error(
                "[{}] : Could not connect to database server. Database connection is down."
                .format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
            raise ConnectionError(
                "Could not connect to database server. Database connection is down."
            ) from err

    #-------------------------------------------------------------------------------------------

    def getDB(self, db):
        """Return the database named *db*, creating it lazily if absent."""
        try:
            return self.__client[db]
        except Exception as err:
            self.__logger.error(
                "[{}] : Cannot find / create database {}".format(
                    datetime.now().strftime("%d/%m/%Y %H:%M:%S"), db))
            raise ValueError(
                "Error finding / creating database {}".format(db)) from err

    def getAllDB(self):
        """Return the list of all database names on the server."""
        return self.__client.list_database_names()

    def getCollection(self, dbName, collectionName):
        """Return collection *collectionName* from database *dbName*."""
        try:
            db = self.getDB(dbName)
            collection = db[collectionName]
            return collection
        except Exception as err:
            self.__logger.error(
                "[{}] : Cannot find / create collection {}".format(
                    datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                    collectionName))
            raise ValueError("Error finding / creating collection {}".format(
                collectionName)) from err

    def getAllCollections(self, dbName=None):
        """List collection names in *dbName*, or in every database if None."""
        collections = []
        if isinstance(dbName, str):
            db = self.getDB(dbName)
            collections = db.list_collection_names()
        else:
            dbList = self.getAllDB()
            for db in dbList:
                collections.extend(self.getDB(db).list_collection_names())
        return collections

    def insertInto(self, dbName, collectionName, data):
        """Insert *data* (dict, list of dicts, or DataFrame) into a collection.

        Raises:
            TypeError: when *data* is none of the supported types.
        """
        collection = self.getCollection(dbName, collectionName)
        if isinstance(data, list):
            collection.insert_many(data)
            self.__logger.info(
                "[{}] : Inserted {} rows into {} collection of {}".format(
                    datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                    str(len(data)), collectionName, dbName))
        elif isinstance(data, dict):
            collection.insert_one(data)
            self.__logger.info(
                "[{}] : Inserted a row into {} collection of {} database with values {}"
                .format(datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                        collectionName, dbName, str(data)))
        elif isinstance(data, pd.DataFrame):
            # DataFrames are converted row-by-row to dicts before insertion.
            data = list(data.to_dict(orient='index').values())
            collection.insert_many(data)
            self.__logger.info(
                "[{}] : Inserted {} rows into {} collection of {}".format(
                    datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                    str(len(data)), collectionName, dbName))
        else:
            # Kept at info level to match the original; the TypeError below
            # is what actually signals the failure to the caller.
            self.__logger.info(
                "[{}] : Data type not supported for insertion. Expected list of dict/ dict, Found {}"
                .format(datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                        str(type(data))))
            raise TypeError(
                "Data type not supported for insertion. Expected list of dict/ dict, Found {}"
                .format(str(type(data))))

    def getFrom(self,
                dbName,
                collectionName,
                top=10,
                data_type='dataframe',
                filter_criteria=None):
        """Fetch all documents of a collection.

        NOTE(review): ``top`` and ``filter_criteria`` are accepted but never
        used -- the query always fetches everything; kept for signature
        compatibility with existing callers.

        Raises:
            ValueError: when the query fails for any reason.
        """
        try:
            collection = self.getCollection(dbName, collectionName)
            data = collection.find({})
            if data_type == 'dataframe':
                data = pd.DataFrame(data)
            elif data_type == 'models':
                # NOTE(review): this loop keeps only the *last* document of
                # the cursor -- confirm that is the intended "models" shape.
                json_data = {}
                for i in data:
                    json_data = i
                data = json_data
            return data
        except Exception as err:
            # Was a bare ``except:``; narrowed so KeyboardInterrupt and
            # SystemExit are no longer swallowed into a ValueError.
            self.__logger.info(
                "[{}] : Error in Querying {} collection of {} database".format(
                    datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                    collectionName, dbName))
            raise ValueError("No Data found with given name") from err