Example 1
    # Requires: from bs4 import BeautifulSoup
    def pulverizeData(self):
        """
        Pulverizes the original raw data, building up a set of links
        to be visited as well as the cleaned-up text content.
        """
        self.anchors = set()

        if not self.rawData:
            return

        # Passing an explicit parser avoids BeautifulSoup 4's warning
        # about relying on the default
        soup = BeautifulSoup(self.rawData, "html.parser")
        # Get rid of all script, style and link elements
        for elem in soup.findAll(["script", "link", "style"]):
            elem.extract()

        # Guard against documents without a <body> element
        body = soup.body
        text_content = str(body) if body is not None else ""
        # Clean all unworthy characters in the text document
        cleaner = HtmlCleaner()
        self.content = cleaner.clean_text(text_content)
        # Ensure that the current URI is not crawled again by putting it
        # into the set of crawled URIs
        PhrasetankSink.addSeenURI(self.currentUri)
        # Extract all links in the document
        links = soup.findAll("a", href=True)

        for link in links:
            uri = PhrasetankRule.isCrawlable(self.baseUri, link["href"])
            # Queue only links we have not seen before for later crawling
            if uri is not None and not PhrasetankSink.hasSeenURI(uri):
                self.anchors.add(uri)
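
For context, here is a minimal usage sketch of the pulverizer. It is not from the original source: it assumes the setter/getter API that Example 2 exercises on PhrasetankPulverizer (setBaseURI, setCurrentURI, setRawData, getLinks) behaves as those names suggest.

raw_html = "<html><body><a href='/about'>About</a></body></html>"

pulverizer = PhrasetankPulverizer()
pulverizer.setBaseURI("http://example.com")      # hypothetical seed URI
pulverizer.setCurrentURI("http://example.com/")  # page the HTML came from
pulverizer.setRawData(raw_html)
pulverizer.pulverizeData()

for link in pulverizer.getLinks():  # anchors collected by pulverizeData()
    print(link)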
Example 2
    # Requires an HTTP client; the call pattern below matches Tornado's
    # synchronous tornado.httpclient.HTTPClient
    def fetchURIData(self, uri):
        """
        Visit the given URI and fetch its page content
        """
        contentsource = None
        print("Visiting page at " + uri)

        try:
            client = HTTPClient()
            response = client.fetch(uri)
            contentsource = response.body or None
        except Exception:
            # Fetch errors leave contentsource as None; handled below
            pass

        if contentsource:
            # Send the content source to the pulverizer for processing
            pulverize = PhrasetankPulverizer()
            pulverize.setBaseURI(self.uri)
            pulverize.setCurrentURI(uri)
            pulverize.setRawData(contentsource)
            # Process the data
            pulverize.pulverizeData()

            links = pulverize.getLinks() or []
            content = pulverize.getTextContent() or ''

            for l in links:
                self.uriTank.add(l)

            if content:
                # Hand the text content to a consumer for further processing
                consumer = PhrasetankConsumer()
                consumer.setDataContent(content)
                consumer.setProducerName(self.name)
                consumer.start()
        else:
            # Perhaps the link has an issue; make sure we do not visit it again
            PhrasetankSink.addSeenURI(uri)
            logMessage('ERROR', 'Failed to read content source for ' + uri)
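
If HTTPClient is Tornado's synchronous client, as the call pattern above suggests, the fetch step can be isolated with narrower error handling. A sketch under that assumption, not the original code:

from tornado.httpclient import HTTPClient, HTTPError

def fetch_body(uri):
    # Fetch a page body, returning None on any fetch failure
    client = HTTPClient()
    try:
        response = client.fetch(uri)
        return response.body or None
    except HTTPError as exc:
        # Raised for non-2xx responses and connection errors
        print("Fetch failed for " + uri + ": " + str(exc))
        return None
    finally:
        client.close()  # release the client's resources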
Example 3
    def run(self):
        """
        Start running the producer
        """
        logMessage('START', 'PhrasetankProducer ' + self.name + ' is starting to crawl ' + self.uri + ' ...')

        while True:
            try:
                uri = self.uriTank.pop()
            except KeyError:
                # The tank is empty, so there is nothing left to crawl
                uri = None

            if not uri:
                break
            # Make sure that we have not visited this link before
            if PhrasetankSink.hasSeenURI(uri):
                # Log the seen status
                logMessage('SEEN', uri + ' has been visited before now...')
                continue
            # OK, we are ready to visit the link
            self.fetchURIData(uri)
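
The de-duplication above leans on PhrasetankSink.addSeenURI and hasSeenURI, whose implementation is not shown. Since producers and consumers appear to run on separate threads (consumer.start() in Example 2, run() here), a plausible sketch is a class-level set guarded by a lock; SeenURIStore is a hypothetical stand-in, not the real PhrasetankSink.

import threading

class SeenURIStore(object):
    # Hypothetical stand-in for PhrasetankSink's seen-URI bookkeeping
    _seen = set()
    _lock = threading.Lock()

    @classmethod
    def addSeenURI(cls, uri):
        # Record a URI so no thread crawls it again
        with cls._lock:
            cls._seen.add(uri)

    @classmethod
    def hasSeenURI(cls, uri):
        # Check membership under the same lock to stay thread-safe
        with cls._lock:
            return uri in cls._seen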