Ejemplo n.º 1
0
def process_episode(number):
	"""Build the RSS ``item`` element for a single episode.

	Reads the locally stored HTML page for the episode, pulls the title,
	description, date and act text out of it, and combines them with the
	audio URL into an ``item`` element.

	Args:
		number: Episode number; handed to the ``Settings`` helpers to locate
			the local files and to build the audio URL.

	Returns:
		The ``item`` element with ``title``, ``description``, ``pubDate``,
		``link``, ``guid`` and ``enclosure`` children.

	Raises:
		Exception: If the HTML file is missing, if ``Settings.CACHE_MP3S`` is
			set and the MP3 file is missing, or if a ``ValueError`` occurs
			while building the item (e.g. the page's date does not match
			the expected "Dec 22, 1995" form).
		LookupError: If the page has no div with id "content".

	Any error raised while reading or parsing the HTML file is reported and
	re-raised unchanged.
	"""
	audiofile = Settings.local_audio_filename(number)
	htmlfile = Settings.local_html_filename(number)

	# The HTML file is always needed. The local MP3 is only needed when MP3s
	# are cached locally; otherwise the remote URL is used further down.
	if not os.path.isfile(htmlfile):
		raise Exception("The HTML file for episode {0} is missing".format(number))
	if Settings.CACHE_MP3S and not os.path.isfile(audiofile):
		raise Exception("The MP3 file for episode {0} is missing".format(number))

	try:
		# 'with' guarantees the handle is closed even if reading or decoding fails.
		with open(htmlfile, 'r') as html_handle:
			file_contents = html_handle.read().decode('utf-8')
		soup = bs4.BeautifulSoup(file_contents)
	except Exception:
		print("Problem trying to read {0}".format(htmlfile))
		# Bare raise keeps the original traceback intact.
		raise

	try:
		# Get size of mp3 file
		# TODO: Come up with some way to get the size of the remote files
		# (28000000 is a placeholder size used when the MP3 is not cached locally).
		filesize = os.path.getsize(audiofile) if Settings.CACHE_MP3S else 28000000

		content_div = soup.find("div", {"id" : "content"})
		if content_div is None:
			raise LookupError("Couldn't find a div with id 'content'")

		acts = get_acts(soup)
		# Combine all act text into a single string. Each act is a separator
		# line, its head, and its body, each on their own line. *Between* acts,
		# separate them by double-newlines.
		# NOTE(review): we might need to stick an explicit line-break marker
		# after each \n -- the original note here was garbled; confirm which one.
		all_acts_text = '\n\n'.join(
			'===========================\n' + act['head'] + '\n' + act['body']
			for act in acts
		)

		# Start building our item
		item = ET.Element('item')

		# title tag
		title = ET.SubElement(item, 'title')
		title.text = get_raw_content(content_div, "h1", {"class" : "node-title"})

		# description tag: the page's description followed by the text of all acts
		description = ET.SubElement(item, 'description')
		description.text = get_raw_content(content_div, "div", {"class" : "description"}) + '\n' + all_acts_text

		# pubDate tag
		# Dates in the html are in the form of "Dec 22, 1995". Parse them to turn them into the RFC format
		datestring = get_raw_content(content_div, "div", {"class" : "date"})
		dateobj = datetime.strptime(datestring, "%b %d, %Y")
		pubDate = ET.SubElement(item, 'pubDate')
		pubDate.text = dateobj.strftime("%a, %d %b %Y 00:00:00 +0000")

		# Point at our own copy of the audio when caching, else at the remote one
		url = Settings.local_audio_url(number) if Settings.CACHE_MP3S else Settings.remote_audio_url(number)

		# link tag
		link = ET.SubElement(item, 'link')
		link.text = url

		# guid tag
		guid = ET.SubElement(item, 'guid')
		guid.text = url

		# enclosure tag (how to actually find the audio clip)
		enclosure = ET.SubElement(item, 'enclosure')
		enclosure.set('url', url)
		enclosure.set('length', str(filesize))
		enclosure.set('type', 'audio/mpeg')

		# NOTE: itunes:summary / itunes:subtitle tags carrying all_acts_text
		# were tried here and are currently not emitted.

		return item
	except ValueError:
		print("Caught an error when trying to process episode {0}".format(number))
		raise Exception("Problem processing episode {0}".format(number))