def process_episode(number):
    """Build the RSS ``item`` element for a single episode.

    Reads the locally stored HTML page for the episode, pulls the title,
    description, date and act text out of it, and assembles an ``item``
    element with ``title``, ``description``, ``pubDate``, ``link``, ``guid``
    and ``enclosure`` children.

    Args:
        number: Episode identifier; passed through to the ``Settings``
            filename/URL helpers and used in error messages.

    Returns:
        The populated ``item`` element created with ``ET.Element``.

    Raises:
        Exception: If the HTML file is missing, if MP3 caching is enabled and
            the local MP3 file is missing, or if a ``ValueError`` occurs while
            building the item (for example when the date string does not
            match the expected format).
        LookupError: If the page contains no ``div`` with id ``content``.

    Any error raised while reading or parsing the HTML file is re-raised
    unchanged after a diagnostic message is printed.
    """
    audiofile = Settings.local_audio_filename(number)
    htmlfile = Settings.local_html_filename(number)

    # The HTML page is always required. The MP3 only has to exist locally when
    # cached copies are being served; otherwise the remote URL is used.
    if not os.path.isfile(htmlfile):
        raise Exception("The HTML file for episode {0} is missing".format(number))
    if Settings.CACHE_MP3S and not os.path.isfile(audiofile):
        raise Exception("The MP3 file for episode {0} is missing".format(number))

    try:
        # Context manager guarantees the handle is closed even if decoding fails.
        with open(htmlfile, 'r') as html_handle:
            file_contents = html_handle.read().decode('utf-8')
        soup = bs4.BeautifulSoup(file_contents)
    except Exception:
        # Single-argument parenthesised print behaves the same with or without
        # print_function; bare raise keeps the original traceback.
        print("Problem trying to read {0}".format(htmlfile))
        raise

    try:
        # Size of the mp3 file for the enclosure's length attribute.
        # TODO: Come up with some way to get the size of the remote files
        # (28000000 is a placeholder used when the MP3 is not cached locally).
        filesize = os.path.getsize(audiofile) if Settings.CACHE_MP3S else 28000000

        content_div = soup.find("div", {"id": "content"})
        if content_div is None:
            raise LookupError("Couldn't find a div named 'content_div'")

        acts = get_acts(soup)

        # Combine all act text into a single string. *Within* a single act the
        # heading and body are separated by a newline; *between* acts they are
        # separated by a blank line. (An extra line-break token may be needed
        # after each '\n' -- the original note here was incomplete.)
        all_acts_text = '\n\n'.join(
            '===========================\n' + act['head'] + '\n' + act['body']
            for act in acts
        )

        # Start building our item
        item = ET.Element('item')

        # title tag
        title = ET.SubElement(item, 'title')
        title.text = get_raw_content(content_div, "h1", {"class": "node-title"})

        # description tag: episode blurb followed by the text of every act
        description = ET.SubElement(item, 'description')
        description.text = (
            get_raw_content(content_div, "div", {"class": "description"})
            + '\n' + all_acts_text
        )

        # pubDate tag
        # Dates in the html are in the form of "Dec 22, 1995". Parse them and
        # re-emit them in the RFC 822 style format that RSS expects.
        datestring = get_raw_content(content_div, "div", {"class": "date"})
        dateobj = datetime.strptime(datestring, "%b %d, %Y")
        pubDate = ET.SubElement(item, 'pubDate')
        pubDate.text = dateobj.strftime("%a, %d %b %Y 00:00:00 +0000")

        if Settings.CACHE_MP3S:
            url = Settings.local_audio_url(number)
        else:
            url = Settings.remote_audio_url(number)

        # link tag
        link = ET.SubElement(item, 'link')
        link.text = url

        # guid tag
        guid = ET.SubElement(item, 'guid')
        guid.text = url

        # enclosure tag (how to actually find the audio clip)
        enclosure = ET.SubElement(item, 'enclosure')
        enclosure.set('url', url)
        enclosure.set('length', str(filesize))
        enclosure.set('type', 'audio/mpeg')

        return item
    except ValueError as e:
        # Report the underlying cause (repr keeps the output ASCII-safe) before
        # raising the generic error callers already expect.
        print("Caught an error when trying to process episode {0}: {1!r}".format(number, e))
        raise Exception("Problem processing episode {0}".format(number))