def migrate_pages(self, urls, root_url):
    """Download every URL and recreate it as a page below the initial context.

    For each URL the part after ``root_url`` is split on ``/``.  All
    non-empty segments except the last one are treated as folder names:
    each is passed to ``__create_folder`` and then entered by indexing the
    current context.  The document is then downloaded, fed to an
    ``HTMLReader`` to obtain its title, scrubbed with ``scrubHTML`` and
    handed to ``__create_page`` under the last path segment of the URL.

    Failures do not abort the run.  An ``IOError`` while opening or reading
    a URL, or an ``HTMLParseError`` while parsing it, appends a message to
    ``self.__errors`` and processing continues with the next URL.

    :param urls: iterable of absolute URLs to migrate; each is assumed to
        start with ``root_url`` (this is not checked).
    :param root_url: prefix whose length is stripped from every URL before
        the folder path is derived.
    """
    for url in urls:
        # Each URL is placed relative to the initial context, not relative
        # to wherever the previous URL left the current context.
        self.__current_context = self.__inital_context

        # The last segment is the file name; empty segments come from
        # leading or doubled slashes and are ignored.
        segments = url[len(root_url):].split('/')
        folders = [segment for segment in segments[:-1] if segment]
        for folder in folders:
            self.__create_folder(folder)
            self.__current_context = self.__current_context[folder]

        try:
            downloaded_html = urllib.urlopen(url)
            try:
                # Reading can fail with IOError just like connecting, so it
                # is covered by the same handler.
                html = downloaded_html.read()
            finally:
                # Always release the connection, even if read() failed.
                downloaded_html.close()
        except IOError:
            self.__errors.append("Error connecting to: " + url)
            continue

        reader = HTMLReader()
        try:
            reader.feed(html)
        except HTMLParseError:
            self.__errors.append("Error parsing: " + url)
        else:
            # Only documents that parsed cleanly are turned into pages.
            title = reader.get_title()
            filename = url.split('/').pop()
            cleanedHTML = scrubHTML(html, raise_error=False)
            self.__create_page(filename, title, cleanedHTML)
def safe_html(html):
    """Return ``html`` after passing it through ``scrubHTML``.

    ``scrubHTML`` is called with ``raise_error=False``.
    """
    scrubbed = scrubHTML(html, raise_error=False)
    return scrubbed
def get_citation_string(real_self):
    """Return the custom citation, or build one for a journal review.

    >>> from mock import Mock
    >>> at_mock = Mock()
    >>> at_mock.get = lambda x: None
    >>> at_mock.customCitation = ''
    >>> at_mock.title = "Plone Mag♥"
    >>> at_mock.reviewAuthors = [{'firstname' : 'Cillian♥', 'lastname' : 'de Roiste♥'}]
    >>> at_mock.yearOfPublication = '2009♥'
    >>> at_mock.officialYearOfPublication = '2010♥'
    >>> at_mock.publisher = 'SYSLAB.COM GmbH♥'
    >>> at_mock.placeOfPublication = 'München♥'
    >>> at_mock.volumeNumber = '1♥'
    >>> at_mock.issueNumber = '3♥'
    >>> at_mock.get_issue_title = lambda :'Open Source Mag 1♥'
    >>> at_mock.get_volume_title = lambda :'Open Source Mag Vol 1♥'
    >>> at_mock.get_publication_title = lambda :'Open Source♥'
    >>> at_mock.portal_url = lambda :'http://www.syslab.com'
    >>> at_mock.UID = lambda :'12345'
    >>> at_mock.canonical_uri = ''
    >>> at_mock.page_start_end_in_print = '11-21'
    >>> review = ReviewJournalNoMagic(at_mock)
    >>> review.directTranslate = lambda m: m.default
    >>> review.get_citation_string()
    u'de Roiste\u2665, Cillian\u2665: review of: Plone Mag\u2665, 1\u2665, 3\u2665 (2010\u2665/2009\u2665), in: Open Source\u2665, Open Source Mag Vol 1\u2665, Open Source Mag 1\u2665, p. 11-21, <a href="http://syslab.com/r/12345">http://syslab.com/r/12345</a>'

    The citation follows this schema:
    [reviewer last name], [reviewer first name]: review of:
    [title of the reviewed journal], [volume number],
    [issue number (counted year/year of publication)], in:
    [journal title], [volume number],
    [issue number (counted year/year of publication)],
    p.[pageStart]-[pageEnd] URL recensio.

    There is no year information for the magazine article that reviews
    the other magazine, so none appears after "in:".

    If ``customCitation`` is set, it is scrubbed with ``scrubHTML``,
    decoded as UTF-8 and returned instead of the generated citation.
    """
    context = real_self.magic

    custom = context.customCitation
    if custom:
        return scrubHTML(custom).decode('utf8')

    def translated(msgid, fallback):
        # Translate a message from the "recensio" domain with a default text.
        return real_self.directTranslate(
            Message(msgid, "recensio", default=fallback))

    # Reviewed journal: "title, volume, issue (official year/year)".
    join_details = getFormatter(u', ', u', ', u' ')
    years = getFormatter('/')(
        context.officialYearOfPublication,
        context.yearOfPublication,
    )
    years = '(' + years + ')' if years else None
    reviewed_item = join_details(
        context.title,
        context.volumeNumber,
        context.issueNumber,
        years,
    )

    # Journal that contains the review: "publication, volume, issue".
    join_container = getFormatter(', ', ', ')
    container = join_container(
        context.get_publication_title(),
        context.get_volume_title(),
        context.get_issue_title(),
    )

    # A truthy canonical_uri swaps the UUID URL for a translated label
    # (see #3102).
    where = real_self.getUUIDUrl()
    if getattr(context, "canonical_uri", False):
        where = real_self.directTranslate(
            Message(u"label_downloaded_via_recensio", "recensio"))

    reviewers = get_formatted_names(
        u' / ', ', ', context.reviewAuthors, lastname_first=True)

    args = {
        'review_of': translated(u"text_review_of", "review of:"),
        'in': translated(u"text_in", "in:"),
        'page': translated(u"text_pages", "p."),
        ':': translated(u"text_colon", ":"),
    }

    assemble = getFormatter(
        u'%(:)s %(review_of)s ' % args,
        ', %(in)s ' % args,
        ', %(page)s ' % args,
        u', ',
    )
    # Only the three generated text parts are escaped; the page range and
    # the location are passed through unchanged.
    return assemble(
        escape(reviewers),
        escape(reviewed_item),
        escape(container),
        context.page_start_end_in_print,
        where,
    )
def safe_html(html):
    """Return ``html`` with dangerous elements such as <script/> removed.

    The result is still HTML markup; the remaining tags are kept as they
    are and are not replaced by escaped text.
    """
    scrubbed = scrubHTML(html, raise_error=False)
    return scrubbed
def get_citation_string(real_self):
    """Return the custom citation, or build one for a monograph review.

    >>> from mock import Mock
    >>> at_mock = Mock()
    >>> at_mock.customCitation = ''
    >>> at_mock.get = lambda x: None
    >>> at_mock.formatted_authors_editorial = u"Gerken\u2665, Patrick\u2665 / Pilz, Alexander"
    >>> at_mock.title = "Plone 4.0♥?"
    >>> at_mock.subtitle = "Das Benutzerhandbuch♥"
    >>> at_mock.reviewAuthors = [{'firstname' : 'Cillian♥', 'lastname' : 'de Roiste♥'}]
    >>> at_mock.yearOfPublication = '2009♥'
    >>> at_mock.publisher = 'SYSLAB.COM GmbH♥'
    >>> at_mock.placeOfPublication = 'München♥'
    >>> at_mock.get_issue_title = lambda :'Open Source Mag 1♥'
    >>> at_mock.get_volume_title = lambda :'Open Source Mag Vol 1♥'
    >>> at_mock.get_publication_title = lambda :'Open Source♥'
    >>> at_mock.portal_url = lambda :'http://www.syslab.com'
    >>> at_mock.UID = lambda :'12345'
    >>> at_mock.canonical_uri = ''
    >>> at_mock.page_start_end_in_print = '11-21'
    >>> review = ReviewMonographNoMagic(at_mock)
    >>> review.directTranslate = lambda m: m.default
    >>> review.get_citation_string()
    u'de Roiste\u2665, Cillian\u2665: review of: Gerken\u2665, Patrick\u2665 / Pilz, Alexander, Plone 4.0\u2665? Das Benutzerhandbuch\u2665, M\\xfcnchen\u2665: SYSLAB.COM GmbH\u2665, 2009\u2665, in: Open Source\u2665, Open Source Mag Vol 1\u2665, Open Source Mag 1\u2665, p. 11-21, <a href="http://syslab.com/r/12345">http://syslab.com/r/12345</a>'

    Original spec:
    [reviewer last name], [reviewer first name]: review of:
    [work author last name], [work author first name], [work title].
    [work subtitle], [place of publication]: [publisher], [year], in:
    [journal title], [volume number], [issue number (year of publication)],
    p.[pageStart]-[pageEnd] URL recensio.

    A work may have several authors; they are all listed, separated
    by ' / '.

    Note: the counted year has been removed.

    For journal title, volume number and issue number, the titles of the
    magazine, volume and issue objects that contain the review are used
    (presumably because the review has no such fields of its own).

    Example:
    Müller, Klaus: review of: Meier, Hans, Geschichte des Abendlandes.
    Ein Abriss, München: Oldenbourg, 2010, in: Zeitschrift für
    Geschichte, 39, 3 (2008/2009), www.recensio.net/##

    If ``customCitation`` is set, it is scrubbed with ``scrubHTML``,
    decoded as UTF-8 and returned instead of the generated citation.
    An empty or missing title is tolerated and handled like a title
    without trailing punctuation.
    """
    self = real_self.magic
    if self.customCitation:
        return scrubHTML(self.customCitation).decode("utf8")

    args = {
        "review_of": real_self.directTranslate(
            Message(u"text_review_of", "recensio", default="review of:")),
        "in": real_self.directTranslate(
            Message(u"text_in", "recensio", default="in:")),
        "page": real_self.directTranslate(
            Message(u"text_pages", "recensio", default="p.")),
        ":": real_self.directTranslate(
            Message(u"text_colon", "recensio", default=":")),
    }

    # A title that already ends in punctuation is joined to the subtitle
    # with a plain space instead of ". ".  The truthiness guard keeps an
    # empty (or None) title from raising on the [-1] lookup.
    if self.title and self.title[-1] in "!?:;.,":
        title_subtitle = getFormatter(u" ")
    else:
        title_subtitle = getFormatter(u". ")

    rev_details_formatter = getFormatter(
        u", ", u", ", u"%(:)s " % args, u", ")
    rezensent_string = get_formatted_names(
        u" / ", ", ", self.reviewAuthors, lastname_first=True)
    authors_string = self.formatted_authors_editorial
    title_subtitle_string = title_subtitle(self.title, self.subtitle)
    item_string = rev_details_formatter(
        authors_string,
        title_subtitle_string,
        self.placeOfPublication,
        self.publisher,
        self.yearOfPublication,
    )

    # Titles of the publication, volume and issue that contain the review.
    # No year is appended here (see "counted year has been removed" above).
    mag_number_formatter = getFormatter(u", ", u", ")
    mag_number_string = mag_number_formatter(
        self.get_publication_title(),
        self.get_volume_title(),
        self.get_issue_title(),
    )

    # A truthy canonical_uri swaps the UUID URL for a translated label
    # (see #3102).
    location = real_self.getUUIDUrl()
    if getattr(self, "canonical_uri", False):
        location = real_self.directTranslate(
            Message(u"label_downloaded_via_recensio", "recensio"))

    citation_formatter = getFormatter(
        u"%(:)s %(review_of)s " % args,
        ", %(in)s " % args,
        ", %(page)s " % args,
        u", ",
    )
    # Only the three generated text parts are escaped; the page range and
    # the location are passed through unchanged.
    citation_string = citation_formatter(
        escape(rezensent_string),
        escape(item_string),
        escape(mag_number_string),
        self.page_start_end_in_print,
        location,
    )
    return citation_string