Example #1
    def seturl(self):
        '''Set the URL, then fetch from it and save the result.'''
        user = common.currentuser()
        if not user:
            common.error(self, 404, "User not found.")
            return

        ct = models.CustomTest.all().ancestor(user).get()
        if not ct:
            ct = models.CustomTest(parent=user)

        ct.setbypost(self.request.POST)

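        # If no RSS link is set, fall back to the built-in test HTML;
        # otherwise fetch the feed and abort on a non-200 response.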
        if not ct.rss_link:
            soup = Soup(defines.defaulttesthtml)
        else:
            result = urlfetch.fetch(ct.rss_link)
            if result.status_code != 200:
                common.error(self, 200, "Url Fetch Error")
                return
            soup = Soup(result.content)

        try: 
            ct.data = soup.prettify().decode('UTF-8')
        except ValueError as message:
            common.error(self, 200, message)
            return
Example #2
    def get(self):
        self.response.headers['Content-Type'] = 'text/plain'

        day = date.today() - relativedelta(days=1)
        response = fetch_url("http://podcastrss.play.it/the-sports-junkies_mp3_128.xml")
        if response and response.status_code == 200:
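            # Parse the feed and strip <copyright> elements before snapshotting it.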
            feed_soup = BeautifulSoup(response.content)
            for copyright_el in feed_soup.findAll("copyright"):
                copyright_el.extract()

            self.response.out.write("%s\n\n\n" % feed_soup.prettify())
            DailyFeedSnapshot.create(day, feed_soup.prettify())
            msg = "Created a DailyFeedSnapshot for %s." % (day)
            self.response.out.write(msg)
            logging.info(msg)
        else:
            msg = "Could not create a DailyFeedSnapshot for %s." % (day)
            self.response.out.write(msg)
            logging.error(msg)
Example #3
    def create_book(self):
        if self.fail:
            return False
        paragraphs = []
        temp_file = self.file.decode("utf-8")

#        temp_file = self.__erase_xml_illegal_chars(temp_file)

        if not "temp_dir" in dir(self):
            self.temp_dir = tempfile.mkdtemp()
        if not self.names:
            file = open(self.temp_dir + "/0.html", 'w')
            file.write(self.file)
            file.close()
            os.system(EPUBER_DIR + '/remove_illegal.py <' + self.temp_dir + "/0.html >" + self.temp_dir + "/tmp")
            shutil.move(self.temp_dir + "/tmp", self.temp_dir + "/0.html")
            self.book.add_file(self.temp_dir + "/0.html", 'c0', "")
        else:
        
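            # Split the document at each chapter name; each chunk is prettified
            # and later written out as its own HTML file.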
            for i, name in enumerate(self.names):
                split_index = temp_file.find(name)
                if i == 0:
                    paragraph = ""
                else:
                    paragraph = self.HTML_HEADER
                
                paragraph += temp_file[:split_index]
                soup = BeautifulSoup(paragraph)
                paragraph = soup.prettify()
                paragraphs.append(paragraph)
                temp_file = temp_file[split_index:]
                #soup = BeautifulSoup(temp_file)
                #temp_file = soup.prettify()
            paragraphs.append(BeautifulSoup(self.HTML_HEADER + temp_file).prettify())
        for i, paragraph in enumerate(paragraphs):
            file = open(self.temp_dir + "/%d.html" % i, 'w')
            file.write(paragraph)
            file.close()
            os.system(EPUBER_DIR + '/remove_illegal.py <' + self.temp_dir + "/%d.html >" % i + self.temp_dir + "/tmp")
            shutil.move(self.temp_dir + "/tmp", self.temp_dir + "/%d.html" % i)
            self.book.add_file(self.temp_dir + "/%d.html" % i, 'c%d' % i, self.titles[i])
        for i, image in enumerate(self.images):
            self.book.add_file(image, self.temp_dir + '/im%d' % i, title="", in_spine=False)
        self.book.pack()
        return True
Example #4
def parse_matchup_info(league, team_id):
	logging.info("team_id: %d" % team_id)
	
	generic_matchup_url = build_url(league_id=league.id, page='matchup', params={'mid1': team_id}, access_code=league.access_code)
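	# Scrape the scoreboard block from the matchup page; any fetch or parse failure leaves matchup_soup as None.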
	try:
		matchup_soup = BeautifulSoup(urlfetch.fetch(generic_matchup_url).content).find('div', attrs={'class': 'scoreboard'}).find('li')
	except Exception:
		matchup_soup = None

	if matchup_soup:
		logging.info("\n\n\n%s\n\n\n" % matchup_soup.prettify())
		team_names = [str(row.find('a').contents[0]).strip() for row in matchup_soup.findAll('tr')]
		score = [float(pts.contents[0]) for pts in matchup_soup.findAll('td', attrs={'class': 'pts'})]
	else:
		team_names = None
		score = None
	
	if team_names and score:
		return {'score': score, 'team_names': team_names}
	else:
		return None
Example #5
class Item:
	"""
	A wrapper around a GoogleReader item
	"""
	def __init__(self, feed_item = None, tag_name = '(unknown)', raw_data = None):
		self.had_errors = False
		if feed_item is not None:
			try: self.feed_name = feed_item['feed_name']
			except (KeyError, TypeError):
				self.feed_name = tag_name
			self.tag_name = tag_name
			self.title = strip_html_tags(feed_item['title'])
			self.title = unicode(BeautifulSoup(self.title, convertEntities = BeautifulSoup.HTML_ENTITIES))
			self.google_id = feed_item['google_id']
			self.date = time.strftime('%Y%m%d%H%M%S', time.localtime(float(feed_item['updated'])))
			self.is_read = 'read' in feed_item['categories']
			self.is_starred = 'starred' in feed_item['categories']
			self.is_shared = 'broadcast' in feed_item['categories']
			self.url = feed_item['link']
			self.content = feed_item['content']
			self.original_id = feed_item['original_id']
			self.media = try_lookup(feed_item, 'media')
			self.is_dirty = False
			self.is_stale = False
		else:
			# just copy the dict's keys to my instance vars
			for key,value in raw_data.items():
				setattr(self, key, value)
		
		# calculated attributes that aren't stored in the DB
		self.safe_google_id = Item.escape_google_id(self.google_id)
		self.resources_path = "%s/%s/%s" % (app_globals.OPTIONS['output_path'], app_globals.CONFIG['resources_path'], self.safe_google_id)
		self.basename = self.get_basename()
	
	@staticmethod
	def unescape_google_id(safe_google_id):
		return urllib.unquote(safe_google_id)

	@staticmethod
	def escape_google_id(unsafe_google_id):
		return urllib.quote(unsafe_google_id, safe='')

	def get_basename(self):
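		"""A filesystem-safe key, unique to this item"""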
		return utf8(
			self.date + ' ' +
			filter(lambda x: x not in '"\':#!+/$\\?*', ascii(self.title))[:120] + ' .||' +
			self.safe_google_id + '||' )

	def soup_setup(self):
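		# Parse the item's HTML content and derive a base URL (from original_id) for resolving relative links.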
		self.soup = BeautifulSoup(self.content)
		try:
			self.base = url_dirname(self.original_id)
		except TypeError:
			self.base = None
	
	def soup_teardown(self):
		# Serialize the (possibly modified) soup back into the item's content.
		self.content = self.soup.prettify()
		
	def process(self):
		debug("item %s -> process()" % self.title)
		self.soup_setup()

		# process
		process.insert_alt_text(self.soup)
		self.download_images(need_soup = False)
		
		# save changes back as content
		self.soup_teardown()
	
	def download_images(self, need_soup=True):
		self.had_errors = False

		if need_soup:
			self.soup_setup()
		
		try: media = self.media
		except AttributeError: media = None

		if media is not None:
			success = process.insert_enclosure_images(self.soup, url_list = self.media)
			if not success:
				self.had_errors = True
		
		success = process.download_images(self.soup,
			dest_folder = self.resources_path,
			href_prefix = app_globals.CONFIG['resources_path'] + '/' + self.safe_google_id + '/',
			base_href = self.base)
		if not success:
			self.had_errors = True

		if need_soup:
			self.soup_teardown()
	
	def save(self):
		app_globals.DATABASE.add_item(self)

	def delete(self):
		app_globals.DATABASE.remove_item(self)
		for f in glob.glob(app_globals.OPTIONS['output_path'] + '/*.' + self.safe_google_id + '.*'):
			rm_rf(f)
		rm_rf(self.resources_path)

	def save_to_web(self):
		if not self.is_dirty:
			return
		
		# actions are effects to apply in order to ensure the web has been updated with our current state
		# i.e anything that the user *can* change must be set here
		actions = []
		# read status
		if self.is_read:
			actions.append(app_globals.READER.set_read)

		# stars
		if self.is_starred:
			actions.append(app_globals.READER.add_star)
		
		# share
		if self.is_shared:
			actions.append(app_globals.READER.add_public)

		# apply the actions
		for action in actions:
			Item.google_do_with_id(action, self.google_id)
		
		self.is_dirty = False

	@staticmethod
	def google_do_with_id(action, google_id):
		danger("Applying function %s to item %s" % (action, google_id))
		return action(google_id)
Example #6
class Item:
	"""
	A wrapper around a GoogleReader item
	"""
	def __init__(self, feed_item = None, tag_name = '(unknown)', raw_data = None):
		self.had_errors = False
		if feed_item is not None:
			try: self.feed_name = feed_item['feed_name']
			except (KeyError, TypeError):
				self.feed_name = tag_name
			self.tag_name = tag_name
			self.title = strip_html_tags(utf8(feed_item['title']))
			self.title = unicode(BeautifulSoup(self.title, convertEntities = BeautifulSoup.HTML_ENTITIES))
			self.google_id = feed_item['google_id']
			self.date = time.strftime('%Y%m%d%H%M%S', time.localtime(float(feed_item['updated'])))
			self.is_read = 'read' in feed_item['categories']
			self.is_starred = 'starred' in feed_item['categories']
			self.is_shared = 'broadcast' in feed_item['categories']
			self.url = utf8(feed_item['link'])
			self.content = utf8(feed_item['content'])
			self.original_id = utf8(feed_item['original_id'])
			self.media = try_lookup(feed_item, 'media')
			self.is_pagefeed = self.any_source_is_pagefeed(map(utf8, feed_item['sources']))
			self.instapaper_url = ""
			self.is_dirty = False
			self.is_stale = False
		else:
			# just copy the dict's keys to my instance vars
			for key,value in raw_data.items():
				setattr(self, key, value)
		
		# calculated attributes that aren't stored in the DB
		self.safe_google_id = Item.escape_google_id(self.google_id)
		self.resources_path = "%s/%s/%s" % (app_globals.OPTIONS['output_path'], app_globals.CONFIG['resources_path'], self.safe_google_id)
		self.basename = self.get_basename()
	
	@staticmethod
	def unescape_google_id(safe_google_id):
		return urllib.unquote(safe_google_id)

	@staticmethod
	def escape_google_id(unsafe_google_id):
		return urllib.quote(unsafe_google_id, safe='')

	def get_basename(self):
		"""A filesystem-safe key, unique to this item"""
		return utf8(
			self.date + ' ' +
			filter(lambda x: x not in '"\':#!+/$\\?*', ascii(self.title))[:120] + ' .||' +
			self.safe_google_id + '||' )

	def soup_setup(self):
		self.soup = BeautifulSoup(self.content)
		try:
			self.base = url_dirname(self.original_id)
		except TypeError:
			self.base = None
	
	def soup_teardown(self):
		# Serialize the (possibly modified) soup back into the item's content.
		self.content = self.soup.prettify()
		
	def process(self):
		debug("item %s -> process()" % self.title)
		self.soup_setup()
		thread_pool.ping()
		
		# process
		debug("item %s -> insert_alt_text()" % self.title)
		process.insert_alt_text(self.soup)
		thread_pool.ping()
		
		self.download_images(need_soup = False)
		thread_pool.ping()
		
		# save changes back as content
		self.soup_teardown()
	
	def redownload_images(self):
		self.had_errors = False
		self.download_images()
		self.update()
	
	def download_images(self, need_soup=True):
		self.had_errors = False

		if need_soup:
			self.soup_setup()
		
		try: media = self.media
		except AttributeError: media = None

		if media is not None:
			success = process.insert_enclosure_images(self.soup, url_list = self.media)
			if not success:
				self.had_errors = True
		
		debug("item %s -> download_images()" % (self.title,))
		success = process.download_images(self.soup,
			dest_folder = self.resources_path,
			href_prefix = app_globals.CONFIG['resources_path'] + '/' + self.safe_google_id + '/',
			base_href = self.base)
		if not success:
			self.had_errors = True

		if need_soup:
			self.soup_teardown()
	
	def save(self):
		app_globals.DATABASE.add_item(self)
	
	def update(self):
		app_globals.DATABASE.update_content_for_item(self)

	def delete(self):
		app_globals.DATABASE.remove_item(self)
		for f in glob.glob(app_globals.OPTIONS['output_path'] + '/*.' + self.safe_google_id + '.*'):
			rm_rf(f)
		rm_rf(self.resources_path)
	
	def get_instapaper_urls(self):
		return set(self.instapaper_url.split('|'))
	instapaper_urls = property(get_instapaper_urls)
	
	def save_to_web(self):
		if not self.is_dirty:
			return
		
		# instapaper / pagefeed URLs
		if self.instapaper_url and len(self.instapaper_url) > 0:
			app_globals.URLSAVE.add_urls(self.instapaper_urls)
			self.instapaper_url = ''
		
		# read status
		if self.is_read:
			self._google_do(app_globals.READER.set_read)

		# stars
		if self.is_starred:
			self._google_do(app_globals.READER.add_star)
		
		# share
		if self.is_shared:
			self._google_do(app_globals.READER.add_public)
		
		self.delete_from_web_if_required()
		self.is_dirty = False

	def still_needed(self):
		is_unread = not self.is_read
		needed = is_unread or self.is_starred or self.is_shared
		return needed
	
	def any_source_is_pagefeed(self, sources):
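		# An item counts as PageFeed-sourced if any of its source URLs starts with the configured PageFeed feed prefix.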
		source_is_pagefeed = lambda source: source.startswith(app_globals.CONFIG['pagefeed_feed_url_prefix'])
		return any(map(source_is_pagefeed, sources))
	
	def delete_from_web_if_required(self):
		if (not self.is_pagefeed) or self.still_needed():
			return
		
		try:
			debug("deleting saved url: %s" % (self.url,))
			app_globals.URLSAVE.delete(url=self.url)
		except AttributeError:
			warning("url save mechanism has no delete function")
			return

	def _google_do(self, action):
		return action(self.google_id)
Example #7
    def parse(self, output=""):
        self.temp_dir = tempfile.mkdtemp()
        if output == "":
            output = self.temp_dir + "/tmp.html"

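        # Run wvWare with the wvHtml.xml config to convert the source document to HTML (cp1251 charset), writing to `output`.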
        os.system(
            PATH_TO_WV_WARE
            + "wvWare -x"
            + PATH_TO_WV_HTML
            + "/wvHtml.xml --charset=cp1251 %s > %s" % (self.name, output)
        )

        paragraphs = []
        # temp_file = self.file.decode("utf-8")
        file = open(output, "r")  # read the wvWare output, which may not be under temp_dir if a path was supplied
        temp_file = prettify.remove_spaces(file.read())
        temp_file = prettify.remove_unnecessary_tags(temp_file)
        soup = BeautifulSoup(temp_file)
        temp_names = soup.findAll(align="center")
        names = []
        titles = []
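        # Treat non-empty center-aligned elements as chapter headings: keep the raw
        # markup as split markers and a whitespace-normalized text version as titles.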
        for temp_name in temp_names:
            if not re.match(r"^(<.*?>|\s+)*$", str(temp_name)):
                names.append(re.sub(r"\s+", " ", str(temp_name)))
                temp = re.sub(r"(<.*>|\s+)", " ", temp_name.prettify())
                titles.append(re.sub(r"\s+", " ", temp))

        temp_file = re.sub(r"\s+", " ", temp_file.decode("cp1251").encode("utf-8"))
        out = open(self.temp_dir + "/tmp", "w")
        out.write(temp_file)
        out.write("   \n\n\n")
        for name in names:
            out.write(name + "\n\n\n")
        out.close()

        if not names:
            print "not names"
            file = open(self.temp_dir + "/0.html", "w")
            file.write(temp_file)
            file.close()
            self.book.add_file(self.temp_dir + "/0.html", "c0", "")
        for i, name in enumerate(names):
            split_index = temp_file.find(name)
            if i == 0:
                paragraph = ""
            else:
                paragraph = self.HTML_HEADER

            paragraph += temp_file[:split_index]
            soup = BeautifulSoup(paragraph)
            paragraph = soup.prettify()
            paragraphs.append(paragraph)
            temp_file = temp_file[split_index:]
            # soup = BeautifulSoup(temp_file)
            # temp_file = soup.prettify()
        for i, paragraph in enumerate(paragraphs):
            file = open(self.temp_dir + "/%d.html" % i, "w")
            file.write(paragraph)
            file.close()
            self.book.add_file(self.temp_dir + "/%d.html" % i, "c%d" % i, titles[i])
        # for i, image in enumerate(self.images):
        #    self.book.add_file(image, 'im%d' % i, title="", in_spine=False)

        self.book.pack()
        return True