Example #1
def extract_urls(client, content, encoding, categs):
    # `log`, `time`, `db`, `smart_str` and `build_key` are assumed to be
    # provided by the surrounding module.
    import feedparser

    log.info("Parsing feed content")
    feed = feedparser.parse(content)
    for entry in feed.entries:
        url, title = entry.link, entry.title
        if "content" in entry:
            content = entry.content[0].value
        else:
            content = ""

        # Skip entries without a usable timestamp before claiming to add them.
        if "updated_parsed" not in entry or entry.updated_parsed is None:
            continue
        log.info("Adding & indexing: '%s'" % url)
        t = time.mktime(entry.updated_parsed)
        mutations = [
            db.Mutation(column="Content:raw", value=smart_str(content, encoding)),
            db.Mutation(column="Content:title", value=smart_str(title, encoding)),
            db.Mutation(column="Meta:updated", value=str(t)),
        ]
        client.mutateRow("Urls", url, mutations)

        # Index the URL under every requested category plus the catch-all bucket.
        parts = set(filter(None, [x.strip() for x in categs.split(",")]))
        parts.add("__all__")
        for cat in parts:
            row = build_key(cat, t, url, client, collision_check=True)
            client.mutateRow("UrlsIndex", row, [db.Mutation(column="Url", value=smart_str(url))])
Example #2
    def POST(self, url, data=None):
        request_param = self._request_param.copy()
        if data:
            request_param.update(data)
        response = requests.post(url, data=request_param)
        if response.status_code != 200:
            # A bare string cannot be raised; reuse the API error class instead.
            raise DNSPodError('request error_code: %s' % response.status_code)

        response = response.json()
        if response['status']['code'] != '1':
            raise DNSPodError(smart_str(response['status']['message']))

        return response
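
The `POST` helper above depends on context the snippet leaves out: a `DNSPodError` exception class and a `_request_param` dict of common API parameters. A minimal sketch of that surrounding class, assuming a DNSPod-style `login_token` parameter and endpoint (both are illustrative assumptions, not taken from the original snippet):

import requests


class DNSPodError(Exception):
    """Raised for non-200 HTTP responses or API-level error codes."""


class DNSPodClient(object):
    # Assumed endpoint and common parameters, for illustration only.
    API_BASE = 'https://dnsapi.cn'

    def __init__(self, login_token):
        self._request_param = {'login_token': login_token, 'format': 'json'}

    def POST(self, url, data=None):
        # Same flow as the example above (smart_str omitted to stay self-contained).
        request_param = self._request_param.copy()
        if data:
            request_param.update(data)
        response = requests.post(url, data=request_param)
        if response.status_code != 200:
            raise DNSPodError('request error_code: %s' % response.status_code)
        response = response.json()
        if response['status']['code'] != '1':
            raise DNSPodError(response['status']['message'])
        return response


# Usage sketch:
# client = DNSPodClient('ID,TOKEN')
# domains = client.POST(DNSPodClient.API_BASE + '/Domain.List')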
Example #3
def fetch(feed):
    """
    Fetch feed and detect charset. The return result is a  byte string and
    the encoding information.
    """
    from urllib import urlopen
    import chardet

    log.info("Fetching feed '%s'" % feed)
    content = urlopen(feed).read()

    d = chardet.detect(content)
    log.info("Detected charset: %s" % d["encoding"])

    return smart_str(content, d["encoding"], "replace"), str(d["encoding"])
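
Examples #1 and #3 appear to come from the same crawler: `fetch` returns exactly the `(content, encoding)` pair that `extract_urls` consumes. A rough glue sketch under that assumption; `crawl_feed`, the feed URL, the category string, and the already-connected HBase Thrift `client` are placeholders, not part of the original code:

def crawl_feed(client, feed_url, categories):
    """Glue sketch: fetch a feed and index it via the functions shown above."""
    content, encoding = fetch(feed_url)        # raw bytes + detected charset
    extract_urls(client, content, encoding, categories)

# crawl_feed(hbase_client, "http://example.com/atom.xml", "python, programming")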
Example #4
    def generate(self):
        output_dir = os.path.join(cfg.OUTPUT_DIR, self.direc)
        if not os.path.exists(output_dir):
            log.info("Can't find %s directory.  Skipping generate." % output_dir)
            return
        print "Generating %s" % output_dir
        lopt = {
            "owner_name": self.user,
            "owner_email": self.email,
            "title": self.name,
            "feed_url": "%s%s/atom.xml" % (cfg.BASE_HREF, self.direc),
            "opml_url": "%s%s/opml.xml" % (cfg.BASE_HREF, self.direc),
            "feed_page": "%s%s/" % (cfg.BASE_HREF, self.direc),
            "updated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(self.last_downloaded)),
            "date": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "datemodified": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(self.last_downloaded)),
        }

        ## Get the entries and sort them
        entries = {}
        lopt["Feeds"] = []
        for url, f in self.feeds.items():
            with our_db("cache") as db:
                if not url in db:
                    continue
                try:
                    # cache = db[url.encode("utf-8")]
                    cache = db[url]
                except json.decoder.JSONDecodeError, e:
                    log.debug("Json error on generating url %s: %s" % (url, e))
                    continue

            parsed = cache["data"]
            if not parsed or not parsed["entries"]:
                log.debug("No data for %s.  Skipping." % url)
                continue

            for e in parsed["entries"]:
                e["name"] = f["name"]
                e["links"] = parsed["feed"]["links"]
                e["feed_name"] = smart_str(parsed["feed"]["title"], encoding="ascii", errors="ignore")
                e["channel_title_plain"] = e["feed_name"]
                e["channel_image"] = f["image"]
                e["channel_name"] = e["feed_name"]
                if "subtitle" in parsed["feed"]:
                    e["subtitle"] = parsed["feed"]["subtitle"]
                else:
                    e["subtitle"] = ""
                if parsed["feed"]["link"].endswith("/"):
                    e["channel_link"] = e["feed_id"] = parsed["feed"]["link"]
                else:
                    e["channel_link"] = e["feed_id"] = parsed["feed"]["link"] + "/"

                if "updated" in e:
                    e["date"] = dateutil.parser.parse(e["updated"]).strftime("%Y-%m-%d %H:%M:%S")
                    e["updated"] = dateutil.parser.parse(e["updated"]).isoformat()
                elif "published_parsed" in e:
                    e["date"] = dateutil.parser.parse(e["published_parsed"]["__value__"]).strftime("%Y-%m-%d %H:%M:%S")
                    e["updated"] = dateutil.parser.parse(e["published_parsed"]["__value__"]).isoformat()
                else:
                    e["date"] = e["updated"] = "1970-01-01T00:00:00Z"
                    # We really should assume the blog post is from when it is first seen for lack of a better option
                    # e['date'] = e['updated'] = datetime.now().strftime("%Y-%m-%dT%H:00Z")
                    log.debug("No updated or date field in entry for %s" % url)
                    # pretty_print_dict(e)
                if not "id" in e:
                    e["id"] = e["link"]
                if not "link" in e:
                    e["link"] = e["id"]
                if not e["id"] and not e["link"]:
                    log.debug("%s has neither id nor link" % e["feed_name"])
                entries[e["id"]] = e

            ## OPML template stuff and sidebar stuff
            feed_data = {}
            for l in parsed["feed"]["links"]:
                if not "type" in l:
                    l["type"] = "text/html"
                if l["rel"] == "self":
                    feed_data["url"] = l["href"]
                elif l["rel"] == "alternate":
                    if "href" in l:
                        feed_data["link"] = l["href"]
            feed_data["author"] = f["name"]
            feed_data["title"] = smart_str(parsed["feed"]["title"], encoding="ascii", errors="ignore")
            feed_data["image"] = f["image"]
            if "feedurl" in f:
                feed_data["url"] = f["feedurl"]
            else:
                log.error("%s is missing the feedurl key.  Falling back to url" % url)
                feed_data["url"] = f["url"]
            lopt["Feeds"].append(feed_data)
Example #5
    def _encode_data(self, data):
        """ Encode string data. """
        return util.smart_str(data, self.charset)
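
Example #5 is a one-line wrapper, so its usefulness hinges on what `smart_str` does. Assuming the usual Django-era contract (unicode text is encoded to a byte string in the requested charset; existing byte strings pass through untouched), here is a small Python 2 sketch of that behaviour; `smart_str_sketch` is a stand-in, not the real helper:

# -*- coding: utf-8 -*-
# Stand-in illustrating the assumed contract, not the actual util.smart_str.
def smart_str_sketch(s, encoding='utf-8', errors='strict'):
    if isinstance(s, unicode):            # Python 2: text needs encoding
        return s.encode(encoding, errors)
    return s                              # already a byte string: pass through


assert smart_str_sketch(u'caf\xe9', 'utf-8') == b'caf\xc3\xa9'
assert smart_str_sketch(b'already bytes') == b'already bytes'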
Example #6
   def generate(self):
      output_dir = os.path.join(cfg.OUTPUT_DIR, self.direc)
      if not os.path.exists(output_dir):
         log.info("Can't find %s directory.  Skipping generate." % output_dir)
         return
      print "Generating %s" % output_dir
      lopt = {'owner_name':self.user,
              'owner_email':self.email,
              'title':self.name,
              'feed_url':"%s%s/atom.xml" % (cfg.BASE_HREF, self.direc),
              'opml_url':"%s%s/opml.xml" % (cfg.BASE_HREF, self.direc),
              'feed_page':"%s%s/" % (cfg.BASE_HREF, self.direc),
              'updated':time.strftime("%Y-%m-%dT%H:%M:%SZ",time.gmtime(self.last_downloaded)),
              'date':time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()),
              'datemodified':time.strftime("%a, %d %b %Y %H:%M:%S GMT",time.gmtime(self.last_downloaded)),
              }

      ## Get the entries and sort them
      entries = {}
      lopt['Feeds']=[]
      for url, f in self.feeds.items():
         with our_db('cache') as db:
            if not url in db:
               continue
            try:
               cache = db[url]
            except json.decoder.JSONDecodeError, e:
               log.debug("Json error on generating url %s: %s" % (url, e))
               continue

         parsed = cache['data']
         if not parsed or not parsed['entries']:
            log.debug("No data for %s.  Skipping." % url)
            continue
         
         for e in parsed['entries']:
            e['name'] = f['name']
            if 'links' in parsed['feed']:
               e['links'] = parsed['feed']['links']
            else:
               e['links'] = []
            if 'title' in parsed['feed']:
               e['feed_name'] = smart_str(parsed['feed']['title'], encoding='ascii', errors='ignore')
            else:
               e['feed_name'] = f['name']
            e['channel_title_plain'] = e['feed_name']
            e['channel_image'] = f['image']
            e['channel_name'] = e['feed_name']
            if 'subtitle' in parsed['feed']:
               e['subtitle'] = parsed['feed']['subtitle']
            else:
               e['subtitle']=''
            if 'link' in parsed['feed']:
               if parsed['feed']['link'].endswith('/'):
                  e['channel_link'] = e['feed_id'] = parsed['feed']['link']
               else:
                  e['channel_link'] = e['feed_id'] = parsed['feed']['link']+'/'
            else:
               e['channel_link'] = e['feed_id'] = f['feedurl']
            if 'updated' in e:
               e['date'] = dateutil.parser.parse(e['updated']).strftime("%Y-%m-%d %H:%M:%S")
               e['updated'] = dateutil.parser.parse(e['updated']).isoformat()
            elif 'published_parsed' in e:
               if e['published_parsed'] is None:
                  log.debug('%s has published date that could not be parsed' % e['feed_name'])
               else:
                  if len(e['published_parsed']) == 9:
                     e['date'] = time.strftime("%Y-%m-%d %H:%M:%S", e['published_parsed'])
                     e['updated'] = datetime.date.fromtimestamp(time.mktime(e['published_parsed'])).isoformat()
                  else:
                     e['date'] = dateutil.parser.parse(e['published_parsed']['__value__']).strftime("%Y-%m-%d %H:%M:%S")
                     e['updated'] = dateutil.parser.parse(e['published_parsed']['__value__']).isoformat()
            else:
               e['date'] = e['updated'] = '1970-01-01T00:00:00Z'
               # We really should assume the blog post is from when it is first seen for lack of a better option
               #e['date'] = e['updated'] = datetime.now().strftime("%Y-%m-%dT%H:00Z")
               log.debug("No updated or date field in entry for %s" % url)
               #pretty_print_dict(e)
            if 'id' not in e:
               e['id'] = e['link']
            if 'link' not in e:
               e['link'] = e['id']
            if not e['id'] and not e['link']:
               log.debug('%s has neither id nor link' % e['feed_name'])
            entries[e['id']] = e

         ## OPML template stuff and sidebar stuff
         feed_data = {}

         # Default these to the feed itself
         if 'feedurl' in f:
            feed_data['url'] = f['feedurl']
            feed_data['link'] = f['feedurl']

         # 'e' here is the last entry processed in the loop above; its 'links'
         # were copied from parsed['feed']['links'] (or set to [] when absent).
         for l in e['links']:
            if not 'type' in l:
               l['type']='text/html'
            if l['rel']=="self":
               feed_data['url'] = l['href']
            elif l['rel']=="alternate":
               if 'href' in l:
                  feed_data['link'] = l['href']
         feed_data['author'] = f['name']
         if 'title' in parsed['feed']:
            feed_data['title'] = smart_str(parsed['feed']['title'], encoding='ascii', errors='ignore')
         else:
            feed_data['title'] = f['name']
         feed_data['image'] = f['image']
         if 'feedurl' in f:
            feed_data['url'] = f['feedurl']
         else:
            log.error("%s is missing the feedurl key.  Falling back to url" % url)
            feed_data['url'] = f['url']
         lopt['Feeds'].append(feed_data)
Example #7
   def generate(self):
      output_dir = os.path.join(cfg.OUTPUT_DIR, self.direc)
      if not os.path.exists(output_dir):
         log.info("Can't find %s directory.  Skipping generate." % output_dir)
         return
      print "Generating %s" % output_dir

      lopt = {'owner_name':self.user,
              'title':self.name,
              'feed_url':"%s%s/atom.xml" % (cfg.BASE_HREF, self.direc),
              'opml_url':"%s%s/opml.xml" % (cfg.BASE_HREF, self.direc),
              'feed_page':"%s%s/" % (cfg.BASE_HREF, self.direc),
              'updated':time.strftime("%Y-%m-%dT%H:%M:%SZ",time.gmtime(self.last_downloaded)),
              'date':time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()),
              'datemodified':time.strftime("%a, %d %b %Y %H:%M:%S GMT",time.gmtime(self.last_downloaded)),
              }

      ## Get the entries and sort them
      entries = {}
      lopt['Feeds']=[]
      for url, f in self.feeds.items():
         with our_db('cache') as db:
            if not url in db:
               continue
            try:
               #cache = db[url.encode("utf-8")]
               cache = db[url]
            except json.decoder.JSONDecodeError, e:
               log.debug("Json error on generating url %s: %s" % (url, e))
               continue

         parsed = cache['data']
         if not parsed or not parsed['entries']:
            log.debug("No data for %s.  Skipping." % url)
            continue
         
         for e in parsed['entries']:
            e['name'] = f['name']
            e['links'] = parsed['feed']['links']
            e['feed_name'] = smart_str(parsed['feed']['title'], encoding='ascii', errors='ignore')
            e['channel_title_plain'] = e['feed_name']
            e['channel_image'] = f['image']
            e['channel_name'] = e['feed_name']
            if 'subtitle' in parsed['feed']:
               e['subtitle'] = parsed['feed']['subtitle']
            else:
               e['subtitle']=''
            if parsed['feed']['link'].endswith('/'):
               e['channel_link'] = e['feed_id'] = parsed['feed']['link']
            else:
               e['channel_link'] = e['feed_id'] = parsed['feed']['link']+'/'

            try:
               e['date'] = dateutil.parser.parse(e['updated']).strftime("%Y-%m-%d %H:%M:%S")
               e['updated'] = dateutil.parser.parse(e['updated']).isoformat()
            except KeyError:
               e['date'] = e['updated'] = '1970-01-01T00:00:00Z'
               log.debug("No updated field in entry for %s" % url)

            if 'id' not in e:
               e['id'] = e['link']
            if 'link' not in e:
               e['link'] = e['id']
            if not e['id'] and not e['link']:
               log.debug('%s has neither id nor link' % e['feed_name'])
            entries[e['id']] = e

         ## OPML template stuff and sidebar stuff
         feed_data = {}
         for l in parsed['feed']['links']:
            if not 'type' in l:
               l['type']='text/html'
            if l['rel']=="self":
               feed_data['url'] = l['href']
            elif l['rel']=="alternate":
               if 'href' in l:
                  feed_data['link'] = l['href']
         feed_data['author'] = f['name']
         feed_data['title'] = smart_str(parsed['feed']['title'], encoding='ascii', errors='ignore')
         feed_data['image'] = f['image']
         if 'feedurl' in f:
            feed_data['url'] = f['feedurl']
         else:
            log.error("%s is missing the feedurl key.  Falling back to url" % url)
            feed_data['url'] = f['url']
         lopt['Feeds'].append(feed_data)
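
All three `generate` variants (Examples #4, #6 and #7) coerce feed titles to ASCII with `errors='ignore'`, which silently drops non-ASCII characters rather than replacing them. Under the same assumed `smart_str` contract as above, the effect on a title looks like this (the sample title is illustrative):

# -*- coding: utf-8 -*-
# Effect of the ascii/ignore combination used for feed_name and feed_data['title'].
title = u'Journal de ma caf\xe9ti\xe8re'

ascii_title = title.encode('ascii', 'ignore')   # non-ASCII code points vanish
assert ascii_title == b'Journal de ma caftire'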