def extract_urls(client, content, encoding, categs):
    """Parse a feed payload and store/index every usable entry.

    client   -- HBase/Thrift-style client exposing mutateRow()
    content  -- raw feed text/bytes handed to feedparser
    encoding -- charset used when serializing entry text via smart_str
    categs   -- comma-separated category names used to build index rows

    Entries without a parseable 'updated' timestamp are skipped entirely.
    """
    import feedparser
    log.info("Parsing feed content")
    feed = feedparser.parse(content)
    # The category set is the same for every entry; build it once instead
    # of re-splitting categs on each iteration.
    categories = set(filter(None, [c.strip() for c in categs.split(",")]))
    categories.add("__all__")
    for entry in feed.entries:
        url, title = entry.link, entry.title
        # Skip un-timestamped entries BEFORE logging, so the log only
        # claims "Adding & indexing" for entries that are actually stored.
        if "updated_parsed" not in entry or entry.updated_parsed is None:
            continue
        # Use a dedicated name instead of clobbering the `content` parameter.
        entry_body = entry.content[0].value if "content" in entry else ""
        log.info("Adding & indexing: '%s'" % url)
        t = time.mktime(entry.updated_parsed)
        mutations = [
            db.Mutation(column="Content:raw", value=smart_str(entry_body, encoding)),
            db.Mutation(column="Content:title", value=smart_str(title, encoding)),
            db.Mutation(column="Meta:updated", value=str(t)),
        ]
        client.mutateRow("Urls", url, mutations)
        # Index the entry under every category plus the catch-all bucket.
        for cat in categories:
            row = build_key(cat, t, url, client, collision_check=True)
            client.mutateRow("UrlsIndex", row,
                             [db.Mutation(column="Url", value=smart_str(url))])
def POST(self, url, data=None):
    """POST to *url* with the default request parameters merged with *data*.

    url  -- endpoint to post to
    data -- optional dict of extra parameters overriding the defaults

    Returns the decoded JSON response dict.
    Raises DNSPodError on a non-200 HTTP status, or when the API reports
    a status code other than '1'.
    """
    request_param = self._request_param.copy()
    if data:
        request_param.update(data)
    response = requests.post(url, request_param)
    if response.status_code != 200:
        # The original raised a bare string, which is a TypeError on
        # Python >= 2.6 — raise the module's exception type instead so
        # callers can actually catch it.
        raise DNSPodError('request error_code: %s' % response.status_code)
    payload = response.json()
    if payload['status']['code'] != '1':
        raise DNSPodError(smart_str(payload['status']['message']))
    return payload
def fetch(feed):
    """
    Fetch feed and detect charset.

    The return result is a byte string and the encoding information.
    """
    from contextlib import closing
    from urllib import urlopen
    import chardet
    log.info("Fetching feed '%s'" % feed)
    # closing() guarantees the connection object is released even if
    # read() raises — the original leaked the urlopen handle.
    with closing(urlopen(feed)) as resp:
        content = resp.read()
    d = chardet.detect(content)
    log.info("Detected charset: %s" % d["encoding"])
    return smart_str(content, d["encoding"], "replace"), str(d["encoding"])
def generate(self): output_dir = os.path.join(cfg.OUTPUT_DIR, self.direc) if not os.path.exists(output_dir): log.info("Can't find %s directory. Skipping generate." % output_dir) return print "Generating %s" % output_dir lopt = { "owner_name": self.user, "owner_email": self.email, "title": self.name, "feed_url": "%s%s/atom.xml" % (cfg.BASE_HREF, self.direc), "opml_url": "%s%s/opml.xml" % (cfg.BASE_HREF, self.direc), "feed_page": "%s%s/" % (cfg.BASE_HREF, self.direc), "updated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(self.last_downloaded)), "date": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), "datemodified": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(self.last_downloaded)), } ## Get the entries and sort them entries = {} lopt["Feeds"] = [] for url, f in self.feeds.items(): with our_db("cache") as db: if not url in db: continue try: # cache = db[url.encode("utf-8")] cache = db[url] except json.decoder.JSONDecodeError, e: log.debug("Json error on generating url %s: %s" % (url, e)) continue parsed = cache["data"] if not parsed or not parsed["entries"]: log.debug("No data for %s. Skipping." 
% url) continue for e in parsed["entries"]: e["name"] = f["name"] e["links"] = parsed["feed"]["links"] e["feed_name"] = smart_str(parsed["feed"]["title"], encoding="ascii", errors="ignore") e["channel_title_plain"] = e["feed_name"] e["channel_image"] = f["image"] e["channel_name"] = e["feed_name"] if "subtitle" in parsed["feed"]: e["subtitle"] = parsed["feed"]["subtitle"] else: e["subtitle"] = "" if parsed["feed"]["link"].endswith("/"): e["channel_link"] = e["feed_id"] = parsed["feed"]["link"] else: e["channel_link"] = e["feed_id"] = parsed["feed"]["link"] + "/" if "updated" in e: e["date"] = dateutil.parser.parse(e["updated"]).strftime("%Y-%m-%d %H:%M:%S") e["updated"] = dateutil.parser.parse(e["updated"]).isoformat() elif "published_parsed" in e: e["date"] = dateutil.parser.parse(e["published_parsed"]["__value__"]).strftime("%Y-%m-%d %H:%M:%S") e["updated"] = dateutil.parser.parse(e["published_parsed"]["__value__"]).isoformat() else: e["date"] = e["updated"] = "1970-01-01T00:00:00Z" # We really should assume the blog post is from when it is first seen for lack of a better option # e['date'] = e['updated'] = datetime.now().strftime("%Y-%m-%dT%H:00Z") log.debug("No updated or date field in entry for %s" % url) # pretty_print_dict(e) if not "id" in e: e["id"] = e["link"] if not "link" in e: e["link"] = e["id"] if not e["id"] and not e["link"]: log.debug("%s has neither id nor link" % e["feed_name"]) entries[e["id"]] = e ## OPML template stuff and sidebar stuff feed_data = {} for l in parsed["feed"]["links"]: if not "type" in l: l["type"] = "text/html" if l["rel"] == "self": feed_data["url"] = l["href"] elif l["rel"] == "alternate": if "href" in l: feed_data["link"] = l["href"] feed_data["author"] = f["name"] feed_data["title"] = smart_str(parsed["feed"]["title"], encoding="ascii", errors="ignore") feed_data["image"] = f["image"] if "feedurl" in f: feed_data["url"] = f["feedurl"] else: log.error("%s is missing the feedurl key. 
Falling back to url" % url) feed_data["url"] = f["url"] lopt["Feeds"].append(feed_data)
def _encode_data(self, data):
    """Return *data* as a byte string encoded with this instance's charset."""
    encoded = util.smart_str(data, self.charset)
    return encoded
def generate(self): output_dir = os.path.join(cfg.OUTPUT_DIR, self.direc) if not os.path.exists(output_dir): log.info("Can't find %s directory. Skipping generate." % output_dir) return print "Generating %s" % output_dir lopt = {'owner_name':self.user, 'owner_email':self.email, 'title':self.name, 'feed_url':"%s%s/atom.xml" % (cfg.BASE_HREF, self.direc), 'opml_url':"%s%s/opml.xml" % (cfg.BASE_HREF, self.direc), 'feed_page':"%s%s/" % (cfg.BASE_HREF, self.direc), 'updated':time.strftime("%Y-%m-%dT%H:%M:%SZ",time.gmtime(self.last_downloaded)), 'date':time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()), 'datemodified':time.strftime("%a, %d %b %Y %H:%M:%S GMT",time.gmtime(self.last_downloaded)), } ## Get the entries and sort them entries = {} lopt['Feeds']=[] for url, f in self.feeds.items(): with our_db('cache') as db: if not url in db: continue try: cache = db[url] except json.decoder.JSONDecodeError, e: log.debug("Json error on generating url %s: %s" % (url, e)) continue parsed = cache['data'] if not parsed or not parsed['entries']: log.debug("No data for %s. Skipping." 
% url) continue for e in parsed['entries']: e['name'] = f['name'] if 'links' in parsed['feed']: e['links'] = parsed['feed']['links'] else: e['links'] = [] if 'title' in parsed['feed']: e['feed_name'] = smart_str(parsed['feed']['title'], encoding='ascii', errors='ignore') else: e['feed_name'] = f['name'] e['channel_title_plain'] = e['feed_name'] e['channel_image'] = f['image'] e['channel_name'] = e['feed_name'] if 'subtitle' in parsed['feed']: e['subtitle'] = parsed['feed']['subtitle'] else: e['subtitle']='' if 'link' in parsed['feed']: if parsed['feed']['link'].endswith('/'): e['channel_link'] = e['feed_id'] = parsed['feed']['link'] else: e['channel_link'] = e['feed_id'] = parsed['feed']['link']+'/' else: e['channel_link'] = e['feed_id'] = f['feedurl'] if 'updated' in e: e['date'] = dateutil.parser.parse(e['updated']).strftime("%Y-%m-%d %H:%M:%S") e['updated'] = dateutil.parser.parse(e['updated']).isoformat() elif 'published_parsed' in e: if e['published_parsed'] is None: log.debug('%s has published date that could not be parsed' % e['feed_name']) else: if len(e['published_parsed']) == 9: e['date'] = time.strftime("%Y-%m-%d %H:%M:%S", e['published_parsed']) e['updated'] = datetime.date.fromtimestamp(time.mktime(e['published_parsed'])).isoformat() else: e['date'] = dateutil.parser.parse(e['published_parsed']['__value__']).strftime("%Y-%m-%d %H:%M:%S") e['updated'] = dateutil.parser.parse(e['published_parsed']['__value__']).isoformat() else: e['date'] = e['updated'] = '1970-01-01T00:00:00Z' # We really should assume the blog post is from when it is first seen for lack of a better option #e['date'] = e['updated'] = datetime.now().strftime("%Y-%m-%dT%H:00Z") log.debug("No updated or date field in entry for %s" % url) #pretty_print_dict(e) if not 'id' in e: e['id'] = e['link'] if not 'link' in e: e['link'] = e['id'] if not e['id'] and not e['link']: log.debug('%s has neither id nor link' % e['feed_name']) entries[e['id']] = e ## OPML template stuff and sidebar stuff 
feed_data = {} # Default these to the feed itself if 'feedurl' in f: feed_data['url'] = f['feedurl'] feed_data['link'] = f['feedurl'] for l in e['links']: if not 'type' in l: l['type']='text/html' if l['rel']=="self": feed_data['url'] = l['href'] elif l['rel']=="alternate": if 'href' in l: feed_data['link'] = l['href'] feed_data['author'] = f['name'] if 'title' in parsed['feed']: feed_data['title'] = smart_str(parsed['feed']['title'], encoding='ascii', errors='ignore') else: feed_data['title'] = f['name'] feed_data['image'] = f['image'] if 'feedurl' in f: feed_data['url'] = f['feedurl'] else: log.error("%s is missing the feedurl key. Falling back to url" % url) feed_data['url'] = f['url'] lopt['Feeds'].append(feed_data)
def generate(self): output_dir = os.path.join(cfg.OUTPUT_DIR, self.direc) if not os.path.exists(output_dir): log.info("Can't find %s directory. Skipping generate." % output_dir) return print "Generating %s" % output_dir lopt = {'owner_name':self.user, 'title':self.name, 'feed_url':"%s%s/atom.xml" % (cfg.BASE_HREF, self.direc), 'opml_url':"%s%s/opml.xml" % (cfg.BASE_HREF, self.direc), 'feed_page':"%s%s/" % (cfg.BASE_HREF, self.direc), 'updated':time.strftime("%Y-%m-%dT%H:%M:%SZ",time.gmtime(self.last_downloaded)), 'date':time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()), 'datemodified':time.strftime("%a, %d %b %Y %H:%M:%S GMT",time.gmtime(self.last_downloaded)), } ## Get the entries and sort them entries = {} lopt['Feeds']=[] for url, f in self.feeds.items(): with our_db('cache') as db: if not url in db: continue try: #cache = db[url.encode("utf-8")] cache = db[url] except json.decoder.JSONDecodeError, e: log.debug("Json error on generating url %s: %s" % (url, e)) continue parsed = cache['data'] if not parsed or not parsed['entries']: log.debug("No data for %s. Skipping." 
% url) continue for e in parsed['entries']: e['name'] = f['name'] e['links'] = parsed['feed']['links'] e['feed_name'] = smart_str(parsed['feed']['title'], encoding='ascii', errors='ignore') e['channel_title_plain'] = e['feed_name'] e['channel_image'] = f['image'] e['channel_name'] = e['feed_name'] if 'subtitle' in parsed['feed']: e['subtitle'] = parsed['feed']['subtitle'] else: e['subtitle']='' if parsed['feed']['link'].endswith('/'): e['channel_link'] = e['feed_id'] = parsed['feed']['link'] else: e['channel_link'] = e['feed_id'] = parsed['feed']['link']+'/' try: e['date'] = dateutil.parser.parse(e['updated']).strftime("%Y-%m-%d %H:%M:%S") e['updated'] = dateutil.parser.parse(e['updated']).isoformat() except KeyError: e['date'] = e['updated'] = '1970-01-01T00:00:00Z' log.debug("No updated field in entry for %s" % url) if not 'id' in e: e['id'] = e['link'] if not 'link' in e: e['link'] = e['id'] if not e['id'] and not e['link']: log.debug('%s has neither id nor link' % e['feed_name']) entries[e['id']] = e ## OPML template stuff and sidebar stuff feed_data = {} for l in parsed['feed']['links']: if not 'type' in l: l['type']='text/html' if l['rel']=="self": feed_data['url'] = l['href'] elif l['rel']=="alternate": if 'href' in l: feed_data['link'] = l['href'] feed_data['author'] = f['name'] feed_data['title'] = smart_str(parsed['feed']['title'], encoding='ascii', errors='ignore') feed_data['image'] = f['image'] if 'feedurl' in f: feed_data['url'] = f['feedurl'] else: log.error("%s is missing the feedurl key. Falling back to url" % url) feed_data['url'] = f['url'] lopt['Feeds'].append(feed_data)