Esempio n. 1
0
    def __init__(self, downloaddir='./downloads', entityid=None, DEBUG=False):
        """Initialise the converter's state and helper objects.

        Args:
            downloaddir: Directory path, stored as ``self.downloaddir``.
            entityid: Optional entity id, stored as ``self.entityid``.
                ``None`` means no conditional entity id is in effect.
            DEBUG: When true, progress messages are printed to stdout.
        """
        # Event used as the stop flag, plus the interval value.
        # NOTE(review): the interval is presumably the polling period in
        # seconds used by a timer elsewhere in the class -- confirm.
        self._stop = threading.Event()
        self._interval = 1

        self.downloaddir = downloaddir
        self.entityid = entityid
        self.DEBUG = DEBUG

        # Identity comparison is the correct test against the None singleton.
        if self.DEBUG and self.entityid is not None:
            print("Using conditional EntityID.")

        self.unpdfer = Unpdfer()

        # Random UUID string for this instance.
        self.myid = str(uuid.uuid4())

        # NOTE(review): the access layer is always created with DEBUG=True,
        # independent of the DEBUG argument -- confirm this is intended.
        self.dbaccess = Access(DEBUG=True)

        # The message-bus integration (BusAccess callback and its listener
        # thread) is currently disabled, so no listener thread is created.

        if self.DEBUG:
            print("Converter INIT completed successfully.")
Esempio n. 2
0
    def __init__(self, downloaddir='./downloads', entityid=None, DEBUG=False):
        """Initialise the converter's state and helper objects.

        Args:
            downloaddir: Directory path, stored as ``self.downloaddir``.
            entityid: Optional entity id, stored as ``self.entityid``.
                ``None`` means no conditional entity id is in effect.
            DEBUG: When true, progress messages are printed to stdout.
        """
        # Event used as the stop flag, plus the interval value.
        # NOTE(review): the interval is presumably the polling period in
        # seconds used by a timer elsewhere in the class -- confirm.
        self._stop = threading.Event()
        self._interval = 1

        self.downloaddir = downloaddir
        self.entityid = entityid
        self.DEBUG = DEBUG

        # Identity comparison is the correct test against the None singleton.
        if self.DEBUG and self.entityid is not None:
            print("Using conditional EntityID.")

        self.unpdfer = Unpdfer()

        # Random UUID string for this instance.
        self.myid = str(uuid.uuid4())

        # NOTE(review): the access layer is always created with DEBUG=True,
        # independent of the DEBUG argument -- confirm this is intended.
        self.dbaccess = Access(DEBUG=True)

        # The message-bus integration (BusAccess callback and its listener
        # thread) is currently disabled, so no listener thread is created.

        if self.DEBUG:
            print("Converter INIT completed successfully.")
Esempio n. 3
0
class Converter():
    """Poll the database for unconverted PDF documents and convert them.

    ``start()`` arms a ``threading.Timer`` that calls ``convertdoc()``;
    each pass handles at most one document and then re-arms the timer
    until ``stop()`` has set the internal stop event.
    """

    def __init__(self, downloaddir='./downloads', entityid=None, DEBUG=False):
        """Initialise the converter's state and helper objects.

        Args:
            downloaddir: Directory path, stored as ``self.downloaddir``
                (not otherwise used inside this class).
            entityid: Optional entity id handed to the database layer when
                asking for the next unconverted document.
            DEBUG: When true, progress messages are printed to stdout.
        """
        # Stop flag checked by convertdoc(), and the timer delay in seconds.
        self._stop = threading.Event()
        self._interval = 1

        self.downloaddir = downloaddir
        self.entityid = entityid
        self.DEBUG = DEBUG

        if self.DEBUG and self.entityid is not None:
            print("Using conditional EntityID.")

        self.unpdfer = Unpdfer()

        # Random UUID string for this instance.
        self.myid = str(uuid.uuid4())

        # NOTE(review): the access layer is always created with DEBUG=True,
        # independent of the DEBUG argument -- confirm this is intended.
        self.dbaccess = Access(DEBUG=True)

        # The message-bus integration (BusAccess callback and its listener
        # thread) is currently disabled, so there is no listener to manage.
        self.listenthread = None

        if self.DEBUG:
            print("Converter INIT completed successfully.")

    def start(self):
        """Schedule the first ``convertdoc()`` pass and return immediately."""
        if self.DEBUG:
            print("Converter thread started.")

        # convertdoc() re-arms its own timer at the end of every pass.
        threading.Timer(self._interval, self.convertdoc).start()

        if self.DEBUG:
            print("start() exiting.")

    def stop(self):
        """Request that the polling loop stops re-arming its timer."""
        if self.DEBUG:
            print("Converter thread is stopping.")

        # Set the flag first so the timer chain ends even if shutting down
        # a listener fails.
        self._stop.set()

        # No listener thread is created while the bus integration is
        # disabled, and threading.Thread has no stop() method.  The old
        # unconditional call raised AttributeError before the stop flag was
        # set, so only call stop() when the listener really provides it.
        listener = getattr(self, 'listenthread', None)
        stop_listener = getattr(listener, 'stop', None)
        if callable(stop_listener):
            stop_listener()

    def convertdoc(self):
        """Run one polling pass, then re-arm the timer unless stopped.

        Fetches the next unconverted document from the database layer and,
        if there is one, converts it.  Exceptions raised while converting
        are not caught here; in that case the timer is not re-armed.
        """
        if self.DEBUG:
            print('[{0}] Checking for unconverted documents ...'.format(
                str(strftime("%Y-%m-%d %H:%M:%S"))))

        # get the next unconverted document
        doc = self.getunconverted(self.entityid)

        if doc is None:
            # All documents have been converted, nothing to do here.
            if self.DEBUG:
                print("No documents to process.")
        else:
            self._convertone(doc)

        if not self._stop.is_set():
            # schedule the next polling pass
            threading.Timer(self._interval, self.convertdoc).start()
        elif self.DEBUG:
            print("Stop seen - not firing timer event.")

    def _convertone(self, doc):
        """Convert one document record to text and store the result.

        Args:
            doc: Mapping returned by ``getunconverted()``; the ``'docname'``
                and ``'docurl'`` keys are read.
        """
        if self.DEBUG:
            print('Found a document to convert.')

        print(doc)

        pdffilename = doc['docname']
        docurl = doc['docurl']

        if self.DEBUG:
            print('Converting PDF to text ...')

        # convert to text
        created, pdftext, pdfhash, success = self.getpdftext(pdffilename)

        if not success:
            if self.DEBUG:
                print("An error has occured while converting the PDF.")
            return

        if self.DEBUG:
            print("Saving document text to file store ...")

        # The text is written next to the PDF as "<pdf filename>.txt".
        textfilename = "%s.txt" % pdffilename
        self.savetext(textfilename, pdftext)

        if self.DEBUG:
            print("Document saved: {0}".format(textfilename))
            print("Placing document text into database ...")

        # Mark the document as converted, then attach the extracted data.
        self.setconverted(docurl)
        self.setconvertdata(docurl, pdftext, pdfhash, created)

        if self.DEBUG:
            print("New document converted successfully.")

    def getunconverted(self, entityid):
        """Return the database layer's next unconverted document."""
        return self.dbaccess.getunconverted(entityid)

    def setconverted(self, docurl):
        """Forward ``docurl`` to the database layer's ``setconverted()``."""
        return self.dbaccess.setconverted(docurl)

    def setconvertdata(self, docurl, pdftext, pdfhash, created):
        """Store the conversion results via the database layer.

        The database layer's return value is printed unconditionally and
        then returned.
        """
        doc = self.dbaccess.setconvertdata(docurl, pdftext, pdfhash, created)
        print("Document Converted: {0}".format(doc))
        return doc

    def savetext(self, filename, text):
        """Write ``text`` to ``filename``, overwriting any existing file."""
        if self.DEBUG:
            print("Saving document text to filestore ...")
            print("Document length: {0}".format(len(text)))

        # note: will overwrite any existing file
        with open(filename, "w") as f:
            f.write(text)

        if self.DEBUG:
            print("Document text successfully written to file store.")

    def getpdftext(self, filename, SCRUB=True):
        """Convert the PDF at ``filename`` to text using ``self.unpdfer``.

        Args:
            filename: Path of the PDF, passed straight to ``unpdf()``.
            SCRUB: Forwarded to ``unpdf()`` as its ``SCRUB`` keyword.

        Returns:
            ``(created, pdftext, pdfhash, True)`` as reported by
            ``unpdf()`` on success, otherwise ``(None, None, None, False)``.
        """
        if self.DEBUG:
            print("Trying to convert document to text ...")

        created, pdftext, pdfhash, success = self.unpdfer.unpdf(
            filename, SCRUB=SCRUB, verbose=self.DEBUG)

        if not success:
            if self.DEBUG:
                print("Error in PDF->Text conversion:")
            return (None, None, None, False)

        if self.DEBUG:
            print("Document successfully converted from PDF to Text.")
        return (created, pdftext, pdfhash, True)
Esempio n. 4
0
class Converter():
    """Poll the database for unconverted PDF documents and convert them.

    ``start()`` arms a ``threading.Timer`` that calls ``convertdoc()``;
    each pass handles at most one document and then re-arms the timer
    until ``stop()`` has set the internal stop event.
    """

    def __init__(self, downloaddir='./downloads', entityid=None, DEBUG=False):
        """Initialise the converter's state and helper objects.

        Args:
            downloaddir: Directory path, stored as ``self.downloaddir``
                (not otherwise used inside this class).
            entityid: Optional entity id handed to the database layer when
                asking for the next unconverted document.
            DEBUG: When true, progress messages are printed to stdout.
        """
        # Stop flag checked by convertdoc(), and the timer delay in seconds.
        self._stop = threading.Event()
        self._interval = 1

        self.downloaddir = downloaddir
        self.entityid = entityid
        self.DEBUG = DEBUG

        if self.DEBUG and self.entityid is not None:
            print("Using conditional EntityID.")

        self.unpdfer = Unpdfer()

        # Random UUID string for this instance.
        self.myid = str(uuid.uuid4())

        # NOTE(review): the access layer is always created with DEBUG=True,
        # independent of the DEBUG argument -- confirm this is intended.
        self.dbaccess = Access(DEBUG=True)

        # The message-bus integration (BusAccess callback and its listener
        # thread) is currently disabled, so there is no listener to manage.
        self.listenthread = None

        if self.DEBUG:
            print("Converter INIT completed successfully.")

    def start(self):
        """Schedule the first ``convertdoc()`` pass and return immediately."""
        if self.DEBUG:
            print("Converter thread started.")

        # convertdoc() re-arms its own timer at the end of every pass.
        threading.Timer(self._interval, self.convertdoc).start()

        if self.DEBUG:
            print("start() exiting.")

    def stop(self):
        """Request that the polling loop stops re-arming its timer."""
        if self.DEBUG:
            print("Converter thread is stopping.")

        # Set the flag first so the timer chain ends even if shutting down
        # a listener fails.
        self._stop.set()

        # No listener thread is created while the bus integration is
        # disabled, and threading.Thread has no stop() method.  The old
        # unconditional call raised AttributeError before the stop flag was
        # set, so only call stop() when the listener really provides it.
        listener = getattr(self, 'listenthread', None)
        stop_listener = getattr(listener, 'stop', None)
        if callable(stop_listener):
            stop_listener()

    def convertdoc(self):
        """Run one polling pass, then re-arm the timer unless stopped.

        Fetches the next unconverted document from the database layer and,
        if there is one, converts it.  Exceptions raised while converting
        are not caught here; in that case the timer is not re-armed.
        """
        if self.DEBUG:
            print('[{0}] Checking for unconverted documents ...'.format(
                str(strftime("%Y-%m-%d %H:%M:%S"))))

        # get the next unconverted document
        doc = self.getunconverted(self.entityid)

        if doc is None:
            # All documents have been converted, nothing to do here.
            if self.DEBUG:
                print("No documents to process.")
        else:
            self._convertone(doc)

        if not self._stop.is_set():
            # schedule the next polling pass
            threading.Timer(self._interval, self.convertdoc).start()
        elif self.DEBUG:
            print("Stop seen - not firing timer event.")

    def _convertone(self, doc):
        """Convert one document record to text and store the result.

        Args:
            doc: Mapping returned by ``getunconverted()``; the ``'docname'``
                and ``'docurl'`` keys are read.
        """
        if self.DEBUG:
            print('Found a document to convert.')

        print(doc)

        pdffilename = doc['docname']
        docurl = doc['docurl']

        if self.DEBUG:
            print('Converting PDF to text ...')

        # convert to text
        created, pdftext, pdfhash, success = self.getpdftext(pdffilename)

        if not success:
            if self.DEBUG:
                print("An error has occured while converting the PDF.")
            return

        if self.DEBUG:
            print("Saving document text to file store ...")

        # The text is written next to the PDF as "<pdf filename>.txt".
        textfilename = "%s.txt" % pdffilename
        self.savetext(textfilename, pdftext)

        if self.DEBUG:
            print("Document saved: {0}".format(textfilename))
            print("Placing document text into database ...")

        # Mark the document as converted, then attach the extracted data.
        self.setconverted(docurl)
        self.setconvertdata(docurl, pdftext, pdfhash, created)

        if self.DEBUG:
            print("New document converted successfully.")

    def getunconverted(self, entityid):
        """Return the database layer's next unconverted document."""
        return self.dbaccess.getunconverted(entityid)

    def setconverted(self, docurl):
        """Forward ``docurl`` to the database layer's ``setconverted()``."""
        return self.dbaccess.setconverted(docurl)

    def setconvertdata(self, docurl, pdftext, pdfhash, created):
        """Store the conversion results via the database layer.

        The database layer's return value is printed unconditionally and
        then returned.
        """
        doc = self.dbaccess.setconvertdata(docurl, pdftext, pdfhash, created)
        print("Document Converted: {0}".format(doc))
        return doc

    def savetext(self, filename, text):
        """Write ``text`` to ``filename``, overwriting any existing file."""
        if self.DEBUG:
            print("Saving document text to filestore ...")
            print("Document length: {0}".format(len(text)))

        # note: will overwrite any existing file
        with open(filename, "w") as f:
            f.write(text)

        if self.DEBUG:
            print("Document text successfully written to file store.")

    def getpdftext(self, filename, SCRUB=True):
        """Convert the PDF at ``filename`` to text using ``self.unpdfer``.

        Args:
            filename: Path of the PDF, passed straight to ``unpdf()``.
            SCRUB: Forwarded to ``unpdf()`` as its ``SCRUB`` keyword.

        Returns:
            ``(created, pdftext, pdfhash, True)`` as reported by
            ``unpdf()`` on success, otherwise ``(None, None, None, False)``.
        """
        if self.DEBUG:
            print("Trying to convert document to text ...")

        created, pdftext, pdfhash, success = self.unpdfer.unpdf(
            filename, SCRUB=SCRUB, verbose=self.DEBUG)

        if not success:
            if self.DEBUG:
                print("Error in PDF->Text conversion:")
            return (None, None, None, False)

        if self.DEBUG:
            print("Document successfully converted from PDF to Text.")
        return (created, pdftext, pdfhash, True)