Example #1
0
    def __init__(self):
        """Set up the file logger and the database handle.

        The logger named "queryProcessor" writes to logs/queryProcessor.log.
        The database is obtained from DatabaseFactory using the 'database'
        entry of the "SmartPDFAssistant" engine configuration.
        """
        LOG_FILE = "logs/queryProcessor.log"
        logger = logging.getLogger("queryProcessor")
        # logging.getLogger returns the same logger object for a given name,
        # so adding a FileHandler on every instantiation would write each
        # record once per instance ever created (and leak file handles).
        # Attach the file handler only if one is not already present.
        if not any(isinstance(handler, logging.FileHandler)
                   for handler in logger.handlers):
            logger.addHandler(logging.FileHandler(LOG_FILE))
        self.__logger = logger

        self.__config = ConfigurationParser()
        self.__database = DatabaseFactory().getDatabase(
            self.__config.getEngineConfig("SmartPDFAssistant")['database'])
Example #2
0
    def __init__(self):
        """Configure logging, read the database settings and connect.

        The logger named "mongo" writes to logs/database.log. Host and port
        come from ConfigurationParser().getDatabaseConfig(); the connection
        is opened immediately through self.__connect().
        """
        # NOTE(review): this class is not a singleton. Assigning
        # `__metaclass__ = Singleton` inside __init__ only creates an unused
        # local variable and has no effect on how the class is created, so
        # that assignment was removed. If a singleton is wanted, the
        # metaclass has to be declared on the class statement itself.

        # Instantiating the logger. logging.getLogger returns the same object
        # for a given name, so the file handler is attached only once;
        # otherwise every new instance would duplicate each log record.
        LOG_FILE = "logs/database.log"
        logger = logging.getLogger("mongo")
        if not any(isinstance(handler, logging.FileHandler)
                   for handler in logger.handlers):
            logger.addHandler(logging.FileHandler(LOG_FILE))
        self.__logger = logger

        self.__config = ConfigurationParser()

        # Database server configuration (read once instead of once per key)
        db_config = self.__config.getDatabaseConfig()
        self.__port = int(db_config['port'])
        self.__host = db_config['host']
        self.__maxDelay = 10

        self.__connect()
Example #3
0
class PDFProcessor:
    """Turn an uploaded PDF into titled text chunks and store them.

    processPdf runs the whole pipeline: extract the text, split it into
    chunks using the table-of-contents titles found in the text, attach
    keywords to each chunk, then write the chunks to the database and to
    the elastic server.

    A chunk is a list ``[title, paragraph, keywords]`` that is built up in
    place by the private helpers.
    """

    def __init__(self, pdfname):
        """Remember the name of the PDF this processor was created for."""
        self.__pdfname = pdfname

    def processPdf(self, pdfname):
        """Process ``uploads/<pdfname>`` and store the resulting chunks.

        Args:
            pdfname: file name of the PDF inside the ``uploads/`` folder.
        """
        pdf_path = 'uploads/' + pdfname  # input document filename
        text_path = self.__extractText(pdf_path)
        processed_pdf = self.__createChunks(text_path)
        self.__addToDatabase(processed_pdf)
        self.__addToElasticServer(processed_pdf)

    def __extractText(self, pdf_path):
        """Write the text of every page to a ``-text.txt`` file.

        The output name is the PDF's base name with its last four characters
        (the ``.pdf`` extension) replaced by ``-text.txt``, placed in
        ``uploads/``.

        Returns:
            The path of the text file that was written.
        """
        text_path = 'uploads/' + pdf_path.rsplit('/', 1)[-1][:-4] + '-text.txt'
        # `with` guarantees the file is closed even if extraction fails
        # part-way through the document.
        with open(text_path, "w") as out_ptr:
            for page in self.__extractTextByPage(pdf_path):
                out_ptr.write(page)
        return text_path

    def __extractTextByPage(self, pdf_path):
        """Yield the text of each page of the PDF, one string per page.

        Characters that cannot be encoded as ASCII are dropped.
        """
        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                resource_manager = PDFResourceManager()
                fake_file_handle = StringIO()
                converter = TextConverter(resource_manager, fake_file_handle)
                try:
                    page_interpreter = PDFPageInterpreter(resource_manager,
                                                          converter)
                    page_interpreter.process_page(page)

                    text = fake_file_handle.getvalue().encode(
                        'ascii', 'ignore').decode()

                    yield text
                finally:
                    # Close the per-page handles even when processing raises
                    # or the consumer stops iterating early.
                    converter.close()
                    fake_file_handle.close()

    def __createChunks(self, text_path):
        """Read the extracted text and build the list of chunks.

        Returns:
            A list of ``[title, paragraph, keywords]`` lists.
        """
        with open(text_path) as text_file_ptr:
            text = text_file_ptr.read()
        chunks = []
        self.__generateTitles(chunks, text)
        self.__generatePara(chunks, text)
        self.__generateKeywords(chunks)
        return chunks

    def __generateTitles(self, chunks, text):
        """Append one ``[title]`` chunk per table-of-contents line in ``text``.

        A table-of-contents line is a run of title characters followed by at
        least two dots, one whitespace character and a number; the title is
        everything before the first dot.
        """
        # Raw string: same pattern as before, without invalid string escapes.
        table_of_content = re.findall(
            r"(?!\s)[\w\s&\-\,\_\?\&'\(\)\:\/]+\.\.+\s\d+", text)
        for title in table_of_content:
            chunks.append([title.split(".")[0]])

    def __generatePara(self, chunks, text):
        """Append to every chunk the text found under its title.

        Only the part of ``text`` after the first occurrence of the last
        title is searched. For every title but the last, the paragraph is
        the text between that title and the next title; for the last title
        it is everything after the title. A chunk whose title cannot be
        found gets an empty paragraph.
        """
        if not chunks:
            # No titles were found, so there is nothing to split on.
            return

        last_index = len(chunks) - 1
        body = text.split(chunks[last_index][0], 1)[1]

        for index, chunk in enumerate(chunks):
            title = chunk[0]
            # Reset per chunk so a failed search never reuses the previous
            # chunk's paragraph (or hits an unbound name on the first one).
            paragraph = ''
            if index != last_index:
                next_title = chunks[index + 1][0]
                match = re.search(
                    re.escape(title) + r"[\w\s\W]+" + re.escape(next_title),
                    body, flags=re.IGNORECASE)
                if match:
                    # Strip both titles from the same matched text; the
                    # earlier code concatenated two copies of the match.
                    paragraph = match.group(0).replace(title, '', 1)
                    paragraph = paragraph.replace(next_title, '')
            else:
                match = re.search(re.escape(title) + r"[\w\s\W]+", body,
                                  flags=re.IGNORECASE)
                if match:
                    paragraph = match.group(0).replace(title, '')
            chunk.append(paragraph)

    def __generateKeywords(self, chunks):
        """Append to every chunk the ranked phrases (with scores) that Rake
        extracts from the chunk's paragraph."""
        r = Rake()
        for chunk in chunks:
            r.extract_keywords_from_text(chunk[1])
            chunk.append(r.get_ranked_phrases_with_scores())

    def __addToDatabase(self, chunks):
        """Insert every chunk into the ProcessedPDF collection of PDFAssistant.

        The database is the one named by the 'database' entry of the
        "SmartPDFAssistant" engine configuration.
        """
        self.__config = ConfigurationParser()
        self.__database = DatabaseFactory().getDatabase(
            self.__config.getEngineConfig("SmartPDFAssistant")['database'])

        for chunk in chunks:
            self.__database.insertInto(
                "PDFAssistant", "ProcessedPDF", {
                    'Date': datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                    'Title': chunk[0],
                    'Text': chunk[1],
                    'Keywords': chunk[2]
                })

    def __addToElasticServer(self, chunks):
        """Create the "esindex" index and store the chunks in it."""
        es = ElasticServer()
        print("es.index : ", es.createIndex("esindex"))
        es.store_records("esindex", chunks)
Example #4
0
    def __addToDatabase(self, chunks):
        """Insert every chunk into the ProcessedPDF collection of PDFAssistant.

        Each chunk is read by position: index 0 is stored as 'Title', index 1
        as 'Text' and index 2 as 'Keywords', together with the current
        timestamp as 'Date'. The database handle is looked up from the
        'database' entry of the "SmartPDFAssistant" engine configuration and
        kept on the instance.
        """
        parser = ConfigurationParser()
        self.__config = parser
        factory = DatabaseFactory()
        database_name = parser.getEngineConfig("SmartPDFAssistant")['database']
        self.__database = factory.getDatabase(database_name)

        for entry in chunks:
            record = {
                'Date': datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                'Title': entry[0],
                'Text': entry[1],
                'Keywords': entry[2],
            }
            self.__database.insertInto("PDFAssistant", "ProcessedPDF", record)
Example #5
0
class QueryProcessor(Engine):
    """Engine that answers a text query with the stored text of the
    best-matching processed PDF chunk."""

    def __init__(self):
        """Set up the file logger and the database handle.

        The logger named "queryProcessor" writes to logs/queryProcessor.log.
        The database is obtained from DatabaseFactory using the 'database'
        entry of the "SmartPDFAssistant" engine configuration.
        """
        LOG_FILE = "logs/queryProcessor.log"
        logger = logging.getLogger("queryProcessor")
        # logging.getLogger returns the same logger object for a given name,
        # so the file handler is attached only once; otherwise each new
        # instance would duplicate every log record.
        if not any(isinstance(handler, logging.FileHandler)
                   for handler in logger.handlers):
            logger.addHandler(logging.FileHandler(LOG_FILE))
        self.__logger = logger

        self.__config = ConfigurationParser()
        self.__database = DatabaseFactory().getDatabase(
            self.__config.getEngineConfig("SmartPDFAssistant")['database'])

    def searchInElasticServer(self, query):
        """Return what the elastic server's get_shard gives for ``query`` in
        the "esindex" index (used by predict as the candidate titles)."""
        es = ElasticServer()
        selected_titles = es.get_shard("esindex", query)
        return selected_titles

    def train(self, pdfname):
        """Run the PDF processing pipeline on ``pdfname``."""
        print("\n pdfname : " + pdfname)
        processed = PDFProcessor(pdfname)
        processed.processPdf(pdfname)

    def merge(self, list1, list2):
        """Pair the items of both lists position by position.

        Returns:
            A tuple of ``(list1[i], list2[i])`` tuples, truncated to the
            shorter input.
        """
        merged_list = tuple(zip(list1, list2))
        return merged_list

    # Will take the query and return the output
    def predict(self, query, wmdmodel):
        """Answer ``query`` with the text of the closest matching title.

        Args:
            query: the user's query string.
            wmdmodel: model object handed through to WMD.getWMDResponse.

        Returns:
            The stored text of the best-matching title, or a
            "No results found..." message when nothing matches.
        """
        # Logging the query
        self.__logger.info("[{}] : Received Query : {}".format(
            datetime.now().strftime("%d/%m/%Y %H:%M:%S"), query))

        # Applying RAKE on query.
        r = Rake()
        r.extract_keywords_from_text(query)
        # NOTE(review): the ranked phrases are concatenated without any
        # separator; confirm whether a space between phrases was intended.
        query = ''.join(r.get_ranked_phrases())

        wmd = WMD()

        # Searching for titles matched with query in elastic server. This
        # uses the current instance; building a new QueryProcessor per call
        # would re-create the logger handler and the database handle.
        selected_titles = self.searchInElasticServer(query)
        if not selected_titles:
            return "No results found!"

        # Using WMD to get vector distance between query and each of the
        # selected titles
        wmdresponse = wmd.getWMDResponse(wmdmodel, query, selected_titles,
                                         len(selected_titles))

        # Sorting response according to vector distance (ascending)
        response = sorted(self.merge(selected_titles, wmdresponse),
                          key=itemgetter(1))
        if not response:
            return "No results found!"
        best_title = response[0][0]

        # Retrieving preprocessed data from DB
        db_data = self.__database.getFrom("PDFAssistant", "ProcessedPDF", '')

        # Selecting the stored text of the title with the minimum vector
        # distance. If no stored title matches, report "no results" instead
        # of silently returning the text of the last row.
        titles = db_data['Title']
        finalresponse = None
        for i in range(len(titles)):
            if best_title == titles[i]:
                finalresponse = db_data['Text'][i]
                break

        if not finalresponse:
            return "No results found! Please refine your search"

        # Logging the response
        self.__logger.info("[{}] : Answer Sent : {}".format(
            datetime.now().strftime("%d/%m/%Y %H:%M:%S"), finalresponse))
        self.__logger.info("--" * 30)

        # Insertion into DB
        self.__database.insertInto(
            "PDFAssistant", "QueryHistory", {
                'Date': datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                'Query': query,
                'Answer': finalresponse
            })

        return finalresponse
Example #6
0
import os
import logging
from datetime import datetime
from flask import Flask,request, render_template, url_for, redirect, jsonify

from Engines.QueryProcessor import QueryProcessor
from ConfigParser.ConfigParser import ConfigurationParser
from Factory.DatabaseFactory import DatabaseFactory
from Processor.WMD import WMD

# Flask application. Static files and templates are both read from the
# 'templates' folder, and static files are exposed with an empty URL prefix.
app = Flask(__name__, static_url_path='',
            static_folder='templates',
            template_folder='templates')
# Module-level objects created once at import time: the configuration
# parser, the database named by the "SmartPDFAssistant" engine configuration,
# the server port from the server configuration, the query processor and WMD.
config = ConfigurationParser()
db = DatabaseFactory().getDatabase(config.getEngineConfig("SmartPDFAssistant")['database'])
portNumber = int(config.getServerConfig()['port'])
queryProcessor = QueryProcessor()
wmd = WMD()

# Loading Glove vectors
wmdmodel = wmd.load()

# Server logger writing to logs/server.log.
# NOTE(review): no level is set on this logger here; unless logging is
# configured elsewhere, the effective level is WARNING and the info record
# below is not written -- confirm.
LOG_FILE = "logs/server.log"
logger = logging.getLogger("server")
file_handler = logging.FileHandler(LOG_FILE)
logger.addHandler(file_handler)
logger.info("[{}] : Successfully hosted server at {}".format(datetime.now().strftime("%d/%m/%Y %H:%M:%S"), str(portNumber)))


# Route for home page
@app.route('/')
Example #7
0
class Mongo(Database):
    """Database implementation backed by a MongoDB server.

    The connection is opened in the constructor using the host and port
    from ConfigurationParser().getDatabaseConfig(). Activity is logged
    through the "mongo" logger to logs/database.log.
    """

    def __init__(self):
        """Configure logging, read the server settings and connect.

        Raises:
            ConnectionError: if the database server cannot be reached.
        """
        # NOTE(review): this class is not a singleton. Assigning
        # `__metaclass__ = Singleton` inside __init__ only creates an unused
        # local variable and has no effect on how the class is created, so
        # that assignment was removed. If a singleton is wanted, the
        # metaclass has to be declared on the class statement itself.

        # Instantiating the logger. logging.getLogger returns the same object
        # for a given name, so the file handler is attached only once;
        # otherwise every new instance would duplicate each log record.
        LOG_FILE = "logs/database.log"
        logger = logging.getLogger("mongo")
        if not any(isinstance(handler, logging.FileHandler)
                   for handler in logger.handlers):
            logger.addHandler(logging.FileHandler(LOG_FILE))
        self.__logger = logger

        self.__config = ConfigurationParser()

        # Database server configuration (read once instead of once per key)
        db_config = self.__config.getDatabaseConfig()
        self.__port = int(db_config['port'])
        self.__host = db_config['host']
        # Passed to MongoClient as serverSelectionTimeoutMS.
        # NOTE(review): 10 ms is a very short timeout -- confirm intended.
        self.__maxDelay = 10

        self.__connect()

    @staticmethod
    def __now():
        """Return the current local time formatted for log lines."""
        return datetime.now().strftime("%d/%m/%Y %H:%M:%S")

    def __connect(self):
        """Open the client connection and verify the server responds.

        Raises:
            ConnectionError: if creating the client or the server_info()
                round trip fails.
        """
        try:
            self.__client = MongoClient(
                self.__host,
                self.__port,
                serverSelectionTimeoutMS=self.__maxDelay)
            # Forces a round trip so an unreachable server fails here.
            self.__client.server_info()
        except Exception as err:
            # Logged through the class logger (not the root logger) so the
            # failure lands in the same file as the other database records.
            self.__logger.error(
                "[{}] : Could not connect to database server. Database connection is down."
                .format(self.__now()))
            raise ConnectionError(
                "Could not connect to database server. Database connection is down."
            ) from err
        self.__logger.info(
            "[{}] : Successfully connected to database server at {}".
            format(self.__now(), self.__host + ":" + str(self.__port)))
        self.__logger.info("==" * 30)

#-------------------------------------------------------------------------------------------

    def getDB(self, db):
        """Return the database object named ``db``.

        Raises:
            ValueError: if the client lookup fails.
        """
        try:
            return self.__client[db]
        except Exception as err:
            self.__logger.error(
                "[{}] : Cannot find / create database {}".format(
                    self.__now(), db))
            raise ValueError(
                "Error finding / creating database {}".format(db)) from err

    def getAllDB(self):
        """Return the names of all databases on the server."""
        return self.__client.list_database_names()

    def getCollection(self, dbName, collectionName):
        """Return the collection ``collectionName`` of database ``dbName``.

        Raises:
            ValueError: if the database or the collection lookup fails.
        """
        try:
            return self.getDB(dbName)[collectionName]
        except Exception as err:
            self.__logger.error(
                "[{}] : Cannot find / create collection {}".format(
                    self.__now(), collectionName))
            raise ValueError("Error finding / creating collection {}".format(
                collectionName)) from err

    def getAllCollections(self, dbName=None):
        """Return collection names.

        Args:
            dbName: when a string, only that database's collections are
                listed; otherwise the collections of every database on the
                server are concatenated into one list.
        """
        if isinstance(dbName, str):
            return self.getDB(dbName).list_collection_names()

        collections = []
        for db in self.getAllDB():
            collections.extend(self.getDB(db).list_collection_names())
        return collections

    def insertInto(self, dbName, collectionName, data):
        """Insert ``data`` into a collection.

        Args:
            dbName: name of the database.
            collectionName: name of the collection.
            data: a dict (one document), a list of dicts, or a pandas
                DataFrame (one document per row).

        Raises:
            TypeError: if ``data`` is none of the supported types.
        """
        collection = self.getCollection(dbName, collectionName)

        if isinstance(data, pd.DataFrame):
            # One document per row; from here on handled like a list.
            data = list(data.to_dict(orient='index').values())

        if isinstance(data, list):
            collection.insert_many(data)
            self.__logger.info(
                "[{}] : Inserted {} rows into {} collection of {}".format(
                    self.__now(), str(len(data)), collectionName, dbName))
        elif isinstance(data, dict):
            collection.insert_one(data)
            self.__logger.info(
                "[{}] : Inserted a row into {} collection of {} database with values {}"
                .format(self.__now(), collectionName, dbName, str(data)))
        else:
            self.__logger.error(
                "[{}] : Data type not supported for insertion. Expected list of dict/ dict, Found {}"
                .format(self.__now(), str(type(data))))
            raise TypeError(
                "Data type not supported for insertion. Expected list of dict/ dict, Found {}"
                .format(str(type(data))))

    def getFrom(self,
                dbName,
                collectionName,
                top=10,
                data_type='dataframe',
                filter_criteria=None):
        """Return all documents of a collection.

        Args:
            dbName: name of the database.
            collectionName: name of the collection.
            top: accepted for interface compatibility; not used here.
            data_type: 'dataframe' returns a pandas DataFrame of all
                documents; 'models' returns the last document yielded by
                the cursor (an empty dict if there is none); any other
                value returns the cursor itself.
            filter_criteria: accepted for interface compatibility; not used
                here (every document is fetched).

        Raises:
            ValueError: if the query or the conversion fails.
        """
        try:
            collection = self.getCollection(dbName, collectionName)
            data = collection.find({})
            if data_type == 'dataframe':
                data = pd.DataFrame(data)
            elif data_type == 'models':
                json_data = {}
                for document in data:
                    json_data = document
                data = json_data
            return data
        except Exception as err:
            self.__logger.error(
                "[{}] : Error in Querying {} collection of {} database".format(
                    self.__now(), collectionName, dbName))
            raise ValueError("No Data found with given name") from err