def seturl(self):
    '''Sets the URL, fetches from it, and saves the result.'''
    user = common.currentuser()
    if not user:
        common.error(self, 404, "User not found.")
        return
    ct = models.CustomTest.all().ancestor(user).get()
    if not ct:
        ct = models.CustomTest(parent=user)
    ct.setbypost(self.request.POST)
    if not ct.rss_link:
        soup = Soup(defines.defaulttesthtml)
    else:
        result = urlfetch.fetch(ct.rss_link)
        if result.status_code != 200:
            common.error(self, 200, "Url Fetch Error")
            return
        soup = Soup(result.content)
    try:
        ct.data = soup.prettify().decode('UTF-8')
    except ValueError, message:
        common.error(self, 200, message)
        return
def get(self):
    self.response.headers['Content-Type'] = 'text/plain'
    day = date.today() - relativedelta(days=1)
    response = fetch_url("http://podcastrss.play.it/the-sports-junkies_mp3_128.xml")
    if response and response.status_code == 200:
        feed_soup = BeautifulSoup(response.content)
        for copyright_el in feed_soup.findAll("copyright"):
            copyright_el.extract()
        self.response.out.write("%s\n\n\n" % feed_soup.prettify())
        DailyFeedSnapshot.create(day, feed_soup.prettify())
        msg = "Created a DailyFeedSnapshot for %s." % (day)
        self.response.out.write(msg)
        logging.info(msg)
    else:
        msg = "Could not create a DailyFeedSnapshot for %s." % (day)
        self.response.out.write(msg)
        logging.error(msg)
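# A minimal, self-contained sketch of the tag-stripping step used in the handler
# above: parse a feed with BeautifulSoup 3, extract() every <copyright> element,
# and re-serialize with prettify(). The sample XML below is made up for illustration.
from BeautifulSoup import BeautifulSoup

sample_feed = """
<rss><channel>
  <copyright>(c) Example Radio</copyright>
  <item><title>Episode 1</title></item>
</channel></rss>
"""

soup = BeautifulSoup(sample_feed)
for copyright_el in soup.findAll("copyright"):
    copyright_el.extract()          # removes the element from the tree in place
print soup.prettify()               # serialized feed, now without <copyright> tags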
def create_book(self):
    if self.fail:
        return False
    paragraphs = []
    temp_file = self.file.decode("utf-8")
    # temp_file = self.__erase_xml_illegal_chars(temp_file)
    if not "temp_dir" in dir(self):
        self.temp_dir = tempfile.mkdtemp()
    if not self.names:
        file = open(self.temp_dir + "/0.html", 'w')
        file.write(self.file)
        file.close()
        os.system(EPUBER_DIR + '/remove_illegal.py <' + self.temp_dir + "/0.html >" + self.temp_dir + "/tmp")
        shutil.move(self.temp_dir + "/tmp", self.temp_dir + "/0.html")
        self.book.add_file(self.temp_dir + "/0.html", 'c0', "")
    else:
        for i, name in enumerate(self.names):
            split_index = temp_file.find(name)
            if i == 0:
                paragraph = ""
            else:
                paragraph = self.HTML_HEADER
            paragraph += temp_file[:split_index]
            soup = BeautifulSoup(paragraph)
            paragraph = soup.prettify()
            paragraphs.append(paragraph)
            temp_file = temp_file[split_index:]
        #soup = BeautifulSoup(temp_file)
        #temp_file = soup.prettify()
        paragraphs.append(BeautifulSoup(self.HTML_HEADER + temp_file).prettify())
        for i, paragraph in enumerate(paragraphs):
            file = open(self.temp_dir + "/%d.html" % i, 'w')
            file.write(paragraph)
            file.close()
            os.system(EPUBER_DIR + '/remove_illegal.py <' + self.temp_dir + "/%d.html >" % i + self.temp_dir + "/tmp")
            shutil.move(self.temp_dir + "/tmp", self.temp_dir + "/%d.html" % i)
            self.book.add_file(self.temp_dir + "/%d.html" % i, 'c%d' % i, self.titles[i])
    for i, image in enumerate(self.images):
        self.book.add_file(image, self.temp_dir + '/im%d' % i, title="", in_spine=False)
    self.book.pack()
    return True
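# The chapter-splitting loop in create_book() is easier to follow in isolation.
# A hedged sketch of the same idea: cut the document at each chapter marker,
# prettify each chunk, and keep the remainder as the final chapter. HTML_HEADER,
# the sample text, and the marker strings are all made-up stand-ins.
from BeautifulSoup import BeautifulSoup

HTML_HEADER = "<html><body>"
text = "<html><body><h1>One</h1><p>a</p><h1>Two</h1><p>b</p></body></html>"
names = ["<h1>One</h1>", "<h1>Two</h1>"]   # chapter markers found earlier

chunks = []
for i, name in enumerate(names):
    split_index = text.find(name)
    prefix = "" if i == 0 else HTML_HEADER
    # everything before the current marker belongs to the previous chapter
    chunks.append(BeautifulSoup(prefix + text[:split_index]).prettify())
    text = text[split_index:]
# whatever follows the last marker becomes the final chapter
chunks.append(BeautifulSoup(HTML_HEADER + text).prettify())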
def parse_matchup_info(league, team_id):
    logging.info("team_id: %d" % team_id)
    generic_matchup_url = build_url(league_id=league.id, page='matchup',
                                    params={'mid1': team_id},
                                    access_code=league.access_code)
    try:
        matchup_soup = BeautifulSoup(urlfetch.fetch(generic_matchup_url).content).find('div', attrs={'class': 'scoreboard'}).find('li')
    except:
        matchup_soup = None
    if matchup_soup:
        # only prettify once we know the scoreboard block was actually found
        logging.info("\n\n\n%s\n\n\n" % matchup_soup.prettify())
        team_names = [str(row.find('a').contents[0]).strip() for row in matchup_soup.findAll('tr')]
        score = [float(pts.contents[0]) for pts in matchup_soup.findAll('td', attrs={'class': 'pts'})]
    else:
        team_names = None
        score = None
    if team_names and score:
        return {'score': score, 'team_names': team_names}
    else:
        return None
class Item:
    """ A wrapper around a GoogleReader item """
    def __init__(self, feed_item = None, tag_name = '(unknown)', raw_data = None):
        self.had_errors = False
        if feed_item is not None:
            try:
                self.feed_name = feed_item['feed_name']
            except (KeyError, TypeError):
                self.feed_name = tag_name
            self.tag_name = tag_name
            self.title = strip_html_tags(feed_item['title'])
            self.title = unicode(BeautifulSoup(self.title, convertEntities = BeautifulSoup.HTML_ENTITIES))
            self.google_id = feed_item['google_id']
            self.date = time.strftime('%Y%m%d%H%M%S', time.localtime(float(feed_item['updated'])))
            self.is_read = 'read' in feed_item['categories']
            self.is_starred = 'starred' in feed_item['categories']
            self.is_shared = 'broadcast' in feed_item['categories']
            self.url = feed_item['link']
            self.content = feed_item['content']
            self.original_id = feed_item['original_id']
            self.media = try_lookup(feed_item, 'media')
            self.is_dirty = False
            self.is_stale = False
        else:
            # just copy the dict's keys to my instance vars
            for key, value in raw_data.items():
                setattr(self, key, value)

        # calculated attributes that aren't stored in the DB
        self.safe_google_id = Item.escape_google_id(self.google_id)
        self.resources_path = "%s/%s/%s" % (app_globals.OPTIONS['output_path'], app_globals.CONFIG['resources_path'], self.safe_google_id)
        self.basename = self.get_basename()

    @staticmethod
    def unescape_google_id(safe_google_id):
        return urllib.unquote(safe_google_id)

    @staticmethod
    def escape_google_id(unsafe_google_id):
        return urllib.quote(unsafe_google_id, safe='')

    def get_basename(self):
        return utf8(
            self.date + ' ' +
            filter(lambda x: x not in '"\':#!+/$\\?*', ascii(self.title))[:120] +
            ' .||' + self.safe_google_id + '||')

    def soup_setup(self):
        self.soup = BeautifulSoup(self.content)
        try:
            self.base = url_dirname(self.original_id)
        except TypeError:
            self.base = None

    def soup_teardown(self):
        self.content = self.soup.prettify()

    def process(self):
        debug("item %s -> process()" % self.title)
        self.soup_setup()
        # process
        process.insert_alt_text(self.soup)
        self.download_images(need_soup = False)
        # save changes back as content
        self.soup_teardown()

    def download_images(self, need_soup=True):
        self.had_errors = False
        if need_soup:
            self.soup_setup()
        try:
            media = self.media
        except AttributeError:
            media = None
        if media is not None:
            success = process.insert_enclosure_images(self.soup, url_list = self.media)
            if not success:
                self.had_errors = True
        success = process.download_images(self.soup,
            dest_folder = self.resources_path,
            href_prefix = app_globals.CONFIG['resources_path'] + '/' + self.safe_google_id + '/',
            base_href = self.base)
        if not success:
            self.had_errors = True
        if need_soup:
            self.soup_teardown()

    def save(self):
        app_globals.DATABASE.add_item(self)

    def delete(self):
        app_globals.DATABASE.remove_item(self)
        for f in glob.glob(app_globals.OPTIONS['output_path'] + '/*.' + self.safe_google_id + '.*'):
            rm_rf(f)
        rm_rf(self.resources_path)

    def save_to_web(self):
        if not self.is_dirty:
            return
        # actions are effects to apply in order to ensure the web has been updated with our current state
        # i.e anything that the user *can* change must be set here
        actions = []
        # read status
        if self.is_read:
            actions.append(app_globals.READER.set_read)
        # stars
        if self.is_starred:
            actions.append(app_globals.READER.add_star)
        # share
        if self.is_shared:
            actions.append(app_globals.READER.add_public)
        # apply the actions
        for action in actions:
            Item.google_do_with_id(action, self.google_id)
        self.is_dirty = False

    @staticmethod
    def google_do_with_id(action, google_id):
        danger("Applying function %s to item %s" % (action, google_id))
        return action(google_id)
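# The safe_google_id used throughout the Item class is just a fully-escaped
# urllib.quote of the Google Reader item ID, which makes it safe to embed in
# filenames such as the item's basename. A quick round-trip sketch; the example
# ID below is made up.
import urllib

google_id = "tag:google.com,2005:reader/item/00ab12cd34ef5678"   # made-up example
safe = urllib.quote(google_id, safe='')   # every reserved character is %-escaped
assert urllib.unquote(safe) == google_id  # escaping is reversible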
class Item:
    """ A wrapper around a GoogleReader item """
    def __init__(self, feed_item = None, tag_name = '(unknown)', raw_data = None):
        self.had_errors = False
        if feed_item is not None:
            try:
                self.feed_name = feed_item['feed_name']
            except (KeyError, TypeError):
                self.feed_name = tag_name
            self.tag_name = tag_name
            self.title = strip_html_tags(utf8(feed_item['title']))
            self.title = unicode(BeautifulSoup(self.title, convertEntities = BeautifulSoup.HTML_ENTITIES))
            self.google_id = feed_item['google_id']
            self.date = time.strftime('%Y%m%d%H%M%S', time.localtime(float(feed_item['updated'])))
            self.is_read = 'read' in feed_item['categories']
            self.is_starred = 'starred' in feed_item['categories']
            self.is_shared = 'broadcast' in feed_item['categories']
            self.url = utf8(feed_item['link'])
            self.content = utf8(feed_item['content'])
            self.original_id = utf8(feed_item['original_id'])
            self.media = try_lookup(feed_item, 'media')
            self.is_pagefeed = self.any_source_is_pagefeed(map(utf8, feed_item['sources']))
            self.instapaper_url = ""
            self.is_dirty = False
            self.is_stale = False
        else:
            # just copy the dict's keys to my instance vars
            for key, value in raw_data.items():
                setattr(self, key, value)

        # calculated attributes that aren't stored in the DB
        self.safe_google_id = Item.escape_google_id(self.google_id)
        self.resources_path = "%s/%s/%s" % (app_globals.OPTIONS['output_path'], app_globals.CONFIG['resources_path'], self.safe_google_id)
        self.basename = self.get_basename()

    @staticmethod
    def unescape_google_id(safe_google_id):
        return urllib.unquote(safe_google_id)

    @staticmethod
    def escape_google_id(unsafe_google_id):
        return urllib.quote(unsafe_google_id, safe='')

    def get_basename(self):
        """A filesystem-safe key, unique to this item"""
        return utf8(
            self.date + ' ' +
            filter(lambda x: x not in '"\':#!+/$\\?*', ascii(self.title))[:120] +
            ' .||' + self.safe_google_id + '||')

    def soup_setup(self):
        self.soup = BeautifulSoup(self.content)
        try:
            self.base = url_dirname(self.original_id)
        except TypeError:
            self.base = None

    def soup_teardown(self):
        self.content = self.soup.prettify()

    def process(self):
        debug("item %s -> process()" % self.title)
        self.soup_setup()
        thread_pool.ping()
        # process
        debug("item %s -> insert_alt_text()" % self.title)
        process.insert_alt_text(self.soup)
        thread_pool.ping()
        self.download_images(need_soup = False)
        thread_pool.ping()
        # save changes back as content
        self.soup_teardown()

    def redownload_images(self):
        self.had_errors = False
        self.download_images()
        self.update()

    def download_images(self, need_soup=True):
        self.had_errors = False
        if need_soup:
            self.soup_setup()
        try:
            media = self.media
        except AttributeError:
            media = None
        if media is not None:
            success = process.insert_enclosure_images(self.soup, url_list = self.media)
            if not success:
                self.had_errors = True
        debug("item %s -> download_images()" % (self.title,))
        success = process.download_images(self.soup,
            dest_folder = self.resources_path,
            href_prefix = app_globals.CONFIG['resources_path'] + '/' + self.safe_google_id + '/',
            base_href = self.base)
        if not success:
            self.had_errors = True
        if need_soup:
            self.soup_teardown()

    def save(self):
        app_globals.DATABASE.add_item(self)

    def update(self):
        app_globals.DATABASE.update_content_for_item(self)

    def delete(self):
        app_globals.DATABASE.remove_item(self)
        for f in glob.glob(app_globals.OPTIONS['output_path'] + '/*.' + self.safe_google_id + '.*'):
            rm_rf(f)
        rm_rf(self.resources_path)

    def get_instapaper_urls(self):
        return set(self.instapaper_url.split('|'))
    instapaper_urls = property(get_instapaper_urls)

    def save_to_web(self):
        if not self.is_dirty:
            return
        # instapaper / pagefeed URLs
        if self.instapaper_url and len(self.instapaper_url) > 0:
            app_globals.URLSAVE.add_urls(self.instapaper_urls)
            self.instapaper_url = ''
        # read status
        if self.is_read:
            self._google_do(app_globals.READER.set_read)
        # stars
        if self.is_starred:
            self._google_do(app_globals.READER.add_star)
        # share
        if self.is_shared:
            self._google_do(app_globals.READER.add_public)
        self.delete_from_web_if_required()
        self.is_dirty = False

    def still_needed(self):
        is_unread = not self.is_read
        needed = is_unread or self.is_starred or self.is_shared
        return needed

    def any_source_is_pagefeed(self, sources):
        source_is_pagefeed = lambda source: source.startswith(app_globals.CONFIG['pagefeed_feed_url_prefix'])
        return any(map(source_is_pagefeed, sources))

    def delete_from_web_if_required(self):
        if (not self.is_pagefeed) or self.still_needed():
            return
        try:
            debug("deleting saved url: %s" % (self.url,))
            app_globals.URLSAVE.delete(url=self.url)
        except AttributeError:
            warning("url save mechanism has no delete function")
            return

    def _google_do(self, action):
        return action(self.google_id)
def parse(self, output=""):
    self.temp_dir = tempfile.mkdtemp()
    if output == "":
        output = self.temp_dir + "/tmp.html"
    os.system(
        PATH_TO_WV_WARE + "wvWare -x" + PATH_TO_WV_HTML +
        "/wvHtml.xml --charset=cp1251 %s > %s" % (self.name, output)
    )
    paragraphs = []
    # temp_file = self.file.decode("utf-8")
    file = open(self.temp_dir + "/tmp.html", "r")
    temp_file = prettify.remove_spaces(file.read())
    temp_file = prettify.remove_unnecessary_tags(temp_file)
    soup = BeautifulSoup(temp_file)
    temp_names = soup.findAll(align="center")
    names = []
    titles = []
    for temp_name in temp_names:
        if not re.match(r"^(<.*?>|\s+)*$", str(temp_name)):
            names.append(re.sub(r"\s+", " ", str(temp_name)))
            temp = re.sub(r"(<.*>|\s+)", " ", temp_name.prettify())
            titles.append(re.sub(r"\s+", " ", temp))
    temp_file = re.sub(r"\s+", " ", temp_file.decode("cp1251").encode("utf-8"))
    out = open(self.temp_dir + "/tmp", "w")
    out.write(temp_file)
    out.write(" \n\n\n")
    for name in names:
        out.write(name + "\n\n\n")
    out.close()
    if not names:
        print "not names"
        file = open(self.temp_dir + "/0.html", "w")
        file.write(temp_file)
        file.close()
        self.book.add_file(self.temp_dir + "/0.html", "c0", "")
    for i, name in enumerate(names):
        split_index = temp_file.find(name)
        if i == 0:
            paragraph = ""
        else:
            paragraph = self.HTML_HEADER
        paragraph += temp_file[:split_index]
        soup = BeautifulSoup(paragraph)
        paragraph = soup.prettify()
        paragraphs.append(paragraph)
        temp_file = temp_file[split_index:]
    # soup = BeautifulSoup(temp_file)
    # temp_file = soup.prettify()
    for i, paragraph in enumerate(paragraphs):
        file = open(self.temp_dir + "/%d.html" % i, "w")
        file.write(paragraph)
        file.close()
        self.book.add_file(self.temp_dir + "/%d.html" % i, "c%d" % i, titles[i])
    # for i, image in enumerate(self.images):
    #     self.book.add_file(image, 'im%d' % i, title="", in_spine=False)
    self.book.pack()
    return True
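# A small sketch of the heading-detection step above: find elements rendered
# with align="center", keep only those that contain visible text, and derive a
# plain-text title from each. The sample HTML is made up; BeautifulSoup 3 is
# assumed, as in the parser above.
import re
from BeautifulSoup import BeautifulSoup

html = '<p align="center"><b>Chapter One</b></p><p align="center">   </p><p>body</p>'
soup = BeautifulSoup(html)

names, titles = [], []
for candidate in soup.findAll(align="center"):
    # skip candidates that are nothing but tags and whitespace
    if not re.match(r"^(<.*?>|\s+)*$", str(candidate)):
        names.append(re.sub(r"\s+", " ", str(candidate)))
        # strip the markup and collapse whitespace to get a display title
        plain = re.sub(r"(<.*>|\s+)", " ", candidate.prettify())
        titles.append(re.sub(r"\s+", " ", plain).strip())

print titles   # ['Chapter One']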