Ejemplo n.º 1
0
    def __init__(self):

        uid = str(uuid.uuid4())
        self.scraper = Scraper(uid=uid)
Ejemplo n.º 2
0
class Wrapper():

    def __init__(self):

        uid = str(uuid.uuid4())
        self.scraper = Scraper(uid=uid)

    def processDoc(self,payload):

        """
        {
            'sourceid': 'a0351229-3ebd-42b2-add7-4dea1012f96b', 
            'destinationid': 'broadcast', 
            'command': 'found_doc', 
            'message': {
                'scrapedatetime': '2014-02-03 22:47:25', 
                'docurl': u'http://www.growmonroe.org/files/file/Minutes/2014/MINUTES_2014_0121_Draft.pdf', 
                'urldata': {
                    'maxlinklevel': 1, 
                    'description': 'COMIDA Meeting Minutes Website', 
                    'title': 'COMIDA Minutes', 
                    'doctype': 'application/pdf', 
                    'targeturl': 'http://www.growmonroe.org/board-meetings', 
                    'creationdatetime': '2014-02-03 22:47:20'
                }, 
                'linktext': u'Minutes'
            }
        }
        """

        scrapedatetime = payload['message']['scrapedatetime']
        docurl = payload['message']['docurl']
        linktext = payload['message']['linktext']

        success,pdftext,pdfhash,created = self.getText(docurl)

        if success:
            
            # create instance of our elastic search api
            es = elasticsearch.Elasticsearch()

            # create the body we are going to send to elastic search
            body = {
                'scrapedatetime': scrapedatetime,
                'docurl': docurl,
                'linktext': linktext,
                'pdftext': pdftext,
                'pdfhash': pdfhash,
                'created': created,
            }

            # make sure the doc doesn't already exist within the index
            if not self.checkexists(pdfhash):
                retval = True

                # index the doc
                es.index(
                    index="comida",
                    doc_type="pdfdoc",
                    id=uuid.uuid4(),
                    body=body,
                )

                print "Document Indexed."

            else:
                print "Skipping Document, Already Indexed."

        else:
            print "Error processing PDF document."

    def getText(self,url):

        # download the pdf
        dler = DLer()
        downloaddir = "./downloads"
        files,success = dler.dl([url],downloaddir)

        # check for success, and get filename
        if success:
            filename,dldatetime = files[0]
        else:
            print "Error downloading pdf document."

        # convert the pdf
        unpdfer = UnPDFer(filename)
        created,pdftext,pdfhash,success = unpdfer.unpdf(filename,SCRUB=True,verbose=False)

        print created

        if success:

            # make all white space spaces 
            pdftext = pdftext.replace('\n',' ')
            pdftext = pdftext.replace('\r',' ')
            pdftext = pdftext.replace('\t',' ')

            # this is done 4 times for the situation of ' \n\t \n\t' 
            for i in range(0,3):
                pdftext = re.sub(' +',' ',pdftext)
                pdftext = re.sub('\n+','\n',pdftext)
                pdftext = re.sub('\t+','\t',pdftext)

            # recalculate the hash of the text
            hashlib.md5(pdftext).hexdigest()

        return success,pdftext,pdfhash,created

    def checkexists(self,pdfhash):

        # handle misfit case
        if pdfhash == "":
            return False

        body = {
            "query": {
                "match": {
                    "pdfhash": pdfhash
                }
            }
        }
        try:
            results = self.es.search(index="comida",
                                     body=body
            )
        except:
            # if we get here, the index is probably empty
            return False
        exists = False
        if len(results['hits']['hits']) > 0:
            exists = True

        return exists


    def finishedCallback(self,payload):
        #print "Scraper Stpped ..."
        self.scraper.stop()

    def startedCallback(self,payload):
        #print "Scraper Started ..."
        pass

    def broadcastDocCallback(self,payload):
        #print "Document Found ..."

        #print payload

        self.processDoc(payload)

        #raise Exception('debug')

    def go(self):

        print "Creating Scraper ..."

        #uid = str(uuid.uuid4())   
        #scraper = Scraper(uid=uid)

        print "Setting up Scraper ..."

        self.scraper.setFinishedCallback(self.finishedCallback)
        self.scraper.setStartedCallback(self.startedCallback)
        self.scraper.setBroadcastDocCallback(self.broadcastDocCallback)

        urldata = {
            'targeturl': "http://www.growmonroe.org/board-meetings",
            'title': "COMIDA Minutes",
            'description': "COMIDA Meeting Minutes Website",
            'maxlinklevel': 1,
            'creationdatetime': str(strftime("%Y-%m-%d %H:%M:%S")),
            'doctype': 'application/pdf',
        }

        self.scraper.seturldata(urldata)

        print "Starting Scraper ..."

        self.scraper.start()
        self.scraper.begin()