Beispiel #1
0
 def process_as_rss(self, document):
     self.document = document
     self.parse()
     items = self.items()
     for item in items:
         pubdate = item.findAll('dc:date')
         if not pubdate:
             continue
         published_at = self.get_datetime(pubdate.pop().get_text())
         links = [ link.next_sibling for link in item.findAll('link') ]
         for link in links:
             if not link or note.exists(link):
                 continue
             try:
                 self.fetch_and_clean_dom(link)
             except EmptyDOM: # Server returned an empty response.
                 continue
             prioritya = ".  ".join(self.h1s())
             priorityb = ".  ".join(self.h2s())
             priorityc = ".  ".join(self.h3s())
             priorityd = ".  ".join([ a[0] for a in self.as_() ])
             prioritye = " ".join(self.ps())
             
             n, errors = note.get_or_create(link, prioritya, priorityb, priorityc, priorityd, prioritye, published_at)
             if errors:
                 sys.stderr.write(str(errors) + "\n")
Beispiel #2
0
    def process_as_rss(self, document):
        """
        Processes a document assuming RSS format. It follows the links to the articles 
        and saves them with the appropriate priority for the tag text is found under.
        
        :param unicode document: The RSS source code
        
        :rtype None:
        """
        self.document = document
        self.parse()
        items = self.items()
        for item in items:
            pubdate = item.findAll("pubdate")
            if not pubdate:
                continue
            published_at = self.get_datetime(pubdate.pop().get_text())
            links = [link.next_sibling for link in item.findAll("link")]
            for link in links:
                if not link or note.exists(link):
                    continue
                try:
                    self.fetch_and_clean_dom(link)
                except EmptyDOM:  # Server returned an empty response.
                    continue
                prioritya = ".  ".join(self.h1s())
                priorityb = ".  ".join(self.h2s())
                priorityc = ".  ".join(self.h3s())
                priorityd = ".  ".join([a[0] for a in self.as_()])
                prioritye = " ".join(self.ps())

                n, errors = note.get_or_create(
                    link, prioritya, priorityb, priorityc, priorityd, prioritye, published_at
                )
                if errors:
                    sys.stderr.write(str(errors) + "\n")
Beispiel #3
0
 def test_exists(self):
     n, created = self.get_note()
     self.assertTrue(note.exists(n.identifier))