Example #1
  def run(self):
    from singleton import db
    c = db.cursor()
    while True:
      item_uid, rating = self.in_q.get()
      try:
        c.execute("""update fm_items
        set item_rating=?, item_rated=julianday('now')
        where item_uid=?""", [rating, item_uid])
        fb_token = param.settings.get('fb_token', None)
        if rating == 1 and fb_token:
          c.execute("""select feed_uid, item_link, item_title, feed_private
          from fm_items, fm_feeds
          where item_uid=? and feed_uid=item_feed_uid""",
                    [item_uid])
          feed_uid, url, title, private = c.fetchone()
        db.commit()
        if rating == 1 and fb_token and not private:
          callout = random.choice(
            ['Interesting: ', 'Notable: ', 'Recommended: ', 'Thumbs-up: ',
             'Noteworthy: ', 'FYI: ', 'Worth reading: '])
          try:
            social.fb_post(fb_token, callout + title, url)
          except social.ExpiredToken:
            notification(db, c, feed_uid, 'Service notification',
              'The Facebook access token has expired',
              link='/settings#facebook')

      except:
        util.print_stack()
    # this will never be reached
    c.close()
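
The thread above blocks on self.in_q and consumes (item_uid, rating) tuples, posting thumbs-up items to Facebook as a side effect. A minimal sketch of how a caller might hand work to it, assuming the same Queue.Queue instance was passed to the thread as in_q (the rating_q name and the item_uid value are illustrative, not part of the original code):

import Queue

rating_q = Queue.Queue()
# the worker shown above would be constructed with in_q=rating_q and started;
# a rating of 1 marks the item as interesting and may trigger a Facebook post
rating_q.put((1234, 1))  # (item_uid, rating); 1234 is purely illustrative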
Example #2
 def apply(self, content, *args, **kwargs):
   item = args[1]
   if self.link_substr in item['link']:
     try:
       # check if this item has not already been loaded before
       guid = item['id']
       from singleton import db, sqlite
       c = db.cursor()
       if sqlite.paramstyle == 'qmark':
         c.execute("select item_link from fm_items where item_guid=?", [guid])
       elif sqlite.paramstyle == 'pyformat':
         c.execute("select item_link from fm_items where item_guid=%guid)s",
                   {'guid': guid})
       link = c.fetchone()
       c.close()
       if link:
         print >> param.log, 'not dereferencing', guid, '->', link[0]
         item['link'] = link[0]
         return content
       # we haven't seen this article before, buck up and load it
       deref = urllib2.urlopen(item['link']).read()
       m = self.re.search(deref)
       if m and m.groups():
         item['link'] = m.groups()[0]
     except:
       util.print_stack()
   return content
Example #3
def update_feed(db, c, f, feed_uid, feed_xml, feed_etag, feed_modified,
                feed_dupcheck=None):
  print >> param.activity, feed_xml
  if 'why' in f and f['why'] == 'no change since Etag':
    return
  # check for errors - HTTP code 304 means no change
  if not hasattr(f, 'feed') \
     or 'title' not in f.feed and 'link' not in f.feed:
    if not hasattr(f, 'feed'):
      print >> param.log, """FFFFF not hasattr(f, 'feed')""",
    else:
      print >> param.log, """FFFFF title=%r link=%r""" % (
        'title' not in f.feed,
        'link' not in f.feed
      ),
    if 'why' in f:
      print >> param.log, feed_xml, f['why']
    else:
      print >> param.log, feed_xml
      
    # error or timeout - increment error count
    increment_errors(db, c, feed_uid)
  else:
    # no error - reset etag and/or modified date and error count
    clear_errors(db, c, feed_uid, f)
  try:
    process_parsed_feed(db, c, f, feed_uid, feed_dupcheck)
  except:
    util.print_stack(['c', 'f'])
Example #4
def evaluate_rules(item, feed, feed_uid, exempt):
  for rule in rules * (not exempt) + feed_rules.get(feed_uid, list()):
    try:
      if rule.test(item, feed, feed_uid):
        return True, rule
    except:
      util.print_stack(['f'])
  return False, None
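
evaluate_rules only relies on each rule object exposing a uid attribute and a test(item, feed, feed_uid) method. The real rule classes (KeywordRule, PythonRule, TagRule, AuthorRule) are not included in this listing, so the following is only a sketch of that interface; SubstringRule is a made-up illustration, not one of the original classes:

class Rule(object):
  # hypothetical base class showing the interface evaluate_rules expects
  def __init__(self, uid, expires):
    self.uid = uid
    self.expires = expires
  def test(self, item, feed, feed_uid):
    raise NotImplementedError

class SubstringRule(Rule):
  # illustrative only: flag items whose lowercased title contains a substring
  def __init__(self, uid, expires, substr):
    Rule.__init__(self, uid, expires)
    self.substr = substr
  def test(self, item, feed, feed_uid):
    return self.substr in item.get('title_lc', '')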
Example #5
 def run(self):
   while True:
     # XXX should wrap this in a try/except clause
     self.event.wait(param.refresh_interval)
     print >> param.activity, time.ctime(), '- refreshing feeds'
     try:
       update()
     except:
       util.print_stack()
     self.event.clear()
Example #6
def update_feed(db, c, f, feed_uid, feed_xml, feed_etag, feed_modified,
                feed_dupcheck=None):
  print >> param.activity, feed_xml
  # check for errors - HTTP code 304 means no change
  if not hasattr(f, 'feed') or 'status' not in f or \
         'title' not in f.feed and 'link' not in f.feed and \
         ('status' not in f or f['status'] not in [304]):
    # error or timeout - increment error count
    increment_errors(db, c, feed_uid)
  else:
    # no error - reset etag and/or modified date and error count
    clear_errors(db, c, feed_uid, f)
  try:
    process_parsed_feed(db, c, f, feed_uid, feed_dupcheck)
  except:
    util.print_stack(['c', 'f'])
Example #7
def fetch_feed(feed_uid, feed_xml, feed_etag, feed_modified):
  if not feed_etag:
    feed_etag = None
  if not feed_modified:
    feed_modified = None
  try:
    f = feedparser.parse(feed_xml, etag=feed_etag, modified=feed_modified)
  except socket.timeout:
    if param.debug:
      print >> param.log, 'EEEEE error fetching feed', feed_xml
    f = {'channel': {}, 'items': []}
  except:
    if param.debug:
      util.print_stack()
    f = {'channel': {}, 'items': []}
  normalize.normalize_feed(f)
  return f
Example #8
def update(where_clause=''):
  from singleton import db
  c = db.cursor()
  # refresh filtering rules
  filters.load_rules(db, c)
  # at 3AM by default, perform house-cleaning
  if time.localtime()[3] == param.backup_hour:
    cleanup(db, c)
  # create worker threads and the queues used to communicate with them
  work_q = Queue.Queue()
  process_q = Queue.Queue()
  workers = []
  for i in range(param.feed_concurrency):
    workers.append(FeedWorker(i + 1, work_q, process_q))
    workers[-1].start()
  # assign work
  c.execute("""select feed_uid, feed_xml, feed_etag, feed_dupcheck,
  strftime('%s', feed_modified) from fm_feeds where feed_status=0 """
            + where_clause)
  for feed_uid, feed_xml, feed_etag, feed_dupcheck, feed_modified in c:
    if feed_modified:
      feed_modified = float(feed_modified)
      feed_modified = time.localtime(feed_modified)
    else:
      feed_modified = None
    work_q.put((feed_uid, feed_xml, feed_etag, feed_modified, feed_dupcheck))
  # None is an indication for workers to stop
  for i in range(param.feed_concurrency):
    work_q.put(None)
  workers_left = param.feed_concurrency
  while workers_left > 0:
    feed_info = process_q.get()
    # exited worker
    if not feed_info:
      workers_left -= 1
    else:
      try:
        update_feed(db, c, *feed_info)
      except:
        util.print_stack()
      db.commit()
    # give reader threads an opportunity to get their work done
    time.sleep(1)
  c.close()
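
update() talks to its FeedWorker threads through two queues and uses None as a shutdown sentinel in both directions. FeedWorker itself is not part of this listing; the sketch below only shows what such a worker loop could look like, assuming it simply wraps fetch_feed and forwards the result in the argument order update_feed(db, c, *feed_info) expects:

import threading

class FeedWorker(threading.Thread):
  # hypothetical sketch; the real FeedWorker is not shown in these examples
  def __init__(self, worker_id, work_q, process_q):
    threading.Thread.__init__(self)
    self.worker_id = worker_id
    self.work_q = work_q
    self.process_q = process_q
  def run(self):
    while True:
      job = self.work_q.get()
      if job is None:
        # pass the sentinel on so update() can decrement workers_left
        self.process_q.put(None)
        return
      feed_uid, feed_xml, feed_etag, feed_modified, feed_dupcheck = job
      f = fetch_feed(feed_uid, feed_xml, feed_etag, feed_modified)
      # update_feed(db, c, *feed_info) unpacks the tuple in this order
      self.process_q.put(
        (f, feed_uid, feed_xml, feed_etag, feed_modified, feed_dupcheck))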
Example #9
def update(where_clause=''):
    with dbop.db() as db:
        c = db.cursor()
        # refresh filtering rules
        filters.load_rules(c)
        # at 3AM by default, perform house-cleaning
        if time.localtime()[3] == param.backup_hour:
            cleanup(db, c)
        # create worker threads and the queues used to communicate with them
        work_q = Queue.Queue()
        process_q = Queue.Queue()
        workers = []
        for i in range(param.feed_concurrency):
            workers.append(FeedWorker(i + 1, work_q, process_q))
            workers[-1].start()
        # assign work
        c.execute("""select feed_uid, feed_xml, feed_etag, feed_dupcheck,
    strftime('%s', feed_modified) from fm_feeds where feed_status=0 """ +
                  where_clause)
        for feed_uid, feed_xml, feed_etag, feed_dupcheck, feed_modified in c:
            if feed_modified:
                feed_modified = float(feed_modified)
                feed_modified = time.localtime(feed_modified)
            else:
                feed_modified = None
            work_q.put(
                (feed_uid, feed_xml, feed_etag, feed_modified, feed_dupcheck))
        # None is an indication for workers to stop
        for i in range(param.feed_concurrency):
            work_q.put(None)
        workers_left = param.feed_concurrency
        while workers_left > 0:
            feed_info = process_q.get()
            # exited worker
            if not feed_info:
                workers_left -= 1
            else:
                try:
                    update_feed(db, c, *feed_info)
                except:
                    util.print_stack()
                db.commit()
            # give reader threads an opportunity to get their work done
            time.sleep(1)
Example #10
def load_rules(db, c):
  global loaded, rules, feed_rules
  if loaded: return
  rules = []
  feed_rules = dict()
  try:
    try:
      c.execute("""select rule_uid, rule_type, rule_text, rule_feed_uid,
      strftime('%s', rule_expires)
      from fm_rules
      where rule_expires is null or rule_expires > julianday('now')""")
      for uid, rtype, rule, feed_uid, expires in c:
        if expires: expires = int(expires)
        if feed_uid:
          container = feed_rules.setdefault(feed_uid, list())
        else:
          container = rules
        if rtype == 'python':
          rule = PythonRule(uid, expires, rule)
          container.append(rule)
        elif rtype == 'tag':
          rule = TagRule(uid, expires, rule)
          container.append(rule)
        elif rtype == 'author':
          rule = AuthorRule(uid, expires, rule)
          container.append(rule)
        elif rtype.startswith('union_'):
          # XXX this convention of adding a second rule object with UID -uid
          # XXX is an ugly hack
          container.append(KeywordRule(
            -uid, expires, rule, rtype.replace('union_', 'title_')))
          container.append(KeywordRule(
            uid, expires, rule, rtype.replace('union_', 'content_')))
        else:
          container.append(KeywordRule(uid, expires, rule, rtype))
      c.execute("""select feed_uid, feed_filter from fm_feeds
      where feed_filter is not null""")
      for feed_uid, rule in c:
        rule = PythonRule('feed_%d' % feed_uid, None, rule)
        feed_rules.setdefault(feed_uid, list()).append(rule)
    except:
      util.print_stack()
  finally:
    loaded = True
Example #11
def load_rules(c):
  global loaded, rules, feed_rules
  if loaded: return
  rules = []
  feed_rules = dict()
  try:
    try:
      for uid, rtype, rule, feed_uid, expires in \
          c.execute("""select rule_uid, rule_type, rule_text, rule_feed_uid,
          strftime('%s', rule_expires)
          from fm_rules
          where rule_expires is null or rule_expires > julianday('now')"""):
        if expires: expires = int(expires)
        if feed_uid:
          container = feed_rules.setdefault(feed_uid, list())
        else:
          container = rules
        if rtype == 'python':
          rule = PythonRule(uid, expires, rule)
          container.append(rule)
        elif rtype == 'tag':
          rule = TagRule(uid, expires, rule)
          container.append(rule)
        elif rtype == 'author':
          rule = AuthorRule(uid, expires, rule)
          container.append(rule)
        elif rtype.startswith('union_'):
          # XXX this convention of adding a second rule object with UID -uid
          # XXX is an ugly hack
          container.append(KeywordRule(
            -uid, expires, rule, rtype.replace('union_', 'title_')))
          container.append(KeywordRule(
            uid, expires, rule, rtype.replace('union_', 'content_')))
        else:
          container.append(KeywordRule(uid, expires, rule, rtype))
      for feed_uid, rule in \
          c.execute("""select feed_uid, feed_filter from fm_feeds
          where feed_filter is not null"""):
        rule = PythonRule('feed_%d' % feed_uid, None, rule)
        feed_rules.setdefault(feed_uid, list()).append(rule)
    except:
      util.print_stack()
  finally:
    loaded = True
Example #12
    def run(self):
        while True:
            item_uid = None
            try:
                item_uid, rating = self.in_q.get()
                with dbop.db() as db:
                    c = db.cursor()
                    try:
                        c.execute(
                            """update fm_items
            set item_rating=?, item_rated=julianday('now')
            where item_uid=?""", [rating, item_uid])
                        fb_token = param.settings.get('fb_token', None)
                        if rating == 1 and fb_token:
                            c.execute(
                                """select feed_uid, item_link, item_title, feed_private
              from fm_items, fm_feeds
              where item_uid=? and feed_uid=item_feed_uid""", [item_uid])
                            feed_uid, url, title, private = c.fetchone()
                        db.commit()
                        if rating == 1 and fb_token and not private:
                            callout = random.choice([
                                'Interesting: ', 'Notable: ', 'Recommended: ',
                                'Thumbs-up: ', 'Noteworthy: ', 'FYI: ',
                                'Worth reading: '
                            ])
                            try:
                                social.fb_post(fb_token, callout + title, url)
                            except social.ExpiredToken:
                                notification(
                                    db,
                                    c,
                                    feed_uid,
                                    'Service notification',
                                    'The Facebook access token has expired',
                                    link='/settings#facebook')

                    except:
                        util.print_stack()
            except:
                util.print_stack()
                if item_uid is not None:
                    self.in_q.put((item_uid, rating))
Example #13
 def step(self, rating, date, decay):
   """The aggregate function takes the following parameters:
   rating: value of item_rating
   date:   value of item_created
   decay:  half-life to use, in days
   """
   # articles older than param.garbage_items cannot be counted towards
   # the SNR, as the uninteresting ones have been purged and thus skew
   # the metric towards 100%
   try:
     if self.ref_date - date < param.garbage_items:
       # by convention, 0 means do not decay (i.e. infinite half-life)
       if decay == 0:
         decay = 1
       else:
         decay = .5 ** ((self.ref_date - date) / decay)
       self.sum_rated += decay * int(rating not in [0, -2])
       self.sum_good += decay * int(rating == 1)
   except:
     util.print_stack()
     raise
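
The docstring above describes an exponential, half-life based weighting of ratings by article age. A self-contained sketch of that weighting (the decay_weight name is illustrative and not part of the original class):

def decay_weight(age_days, half_life_days):
  # by the convention noted in step(), a half-life of 0 means "do not decay"
  if half_life_days == 0:
    return 1.0
  return 0.5 ** (float(age_days) / half_life_days)

# e.g. an article 30 days old with a 30-day half-life contributes 0.5 of a
# full vote to sum_rated / sum_good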
Example #14
def fetch_feed(feed_uid, feed_xml, feed_etag, feed_modified):
    if not feed_etag:
        feed_etag = None
    if not feed_modified:
        feed_modified = None
    try:
        r = requests.get(feed_xml, headers={'If-None-Match': feed_etag})
        if r.content == '':
            return {'channel': {}, 'items': [], 'why': 'no change since Etag'}
        f = feedparser.parse(r.content,
                             etag=r.headers.get('Etag'),
                             modified=feed_modified)
    except (socket.timeout, requests.exceptions.RequestException) as e:
        if param.debug:
            print >> param.log, 'EEEEE error fetching feed', feed_xml, e
        f = {'channel': {}, 'items': [], 'why': repr(e)}
    except:
        if param.debug:
            util.print_stack()
        f = {'channel': {}, 'items': [], 'why': repr(sys.exc_info()[1])}
    normalize.normalize_feed(f)
    return f
Example #15
 def apply(self, content, *args, **kwargs):
   item = args[1]
   if self.link_substr in item['link']:
     try:
       # check if this item has not already been loaded before
       guid = item['id']
       with dbop.db() as db:
         c = db.cursor()
         c.execute("select item_link from fm_items where item_guid=?",
                   [guid])
         link = c.fetchone()
         c.close()
         if link:
           print >> param.log, 'not dereferencing', guid, '->', link[0]
           item['link'] = link[0]
           return content
         # we haven't seen this article before, buck up and load it
         deref = requests.get(item['link']).content
         m = self.re.search(deref)
         if m and m.groups():
           item['link'] = m.groups()[0]
     except:
       util.print_stack()
   return content
Example #16
def dereference(url, seen=None, level=0):
    """Recursively dereference a URL"""
    # this set is used to detect redirection loops
    if seen is None:
        seen = set([url])
    else:
        seen.add(url)
    # stop recursion if it is too deep
    if level > 16:
        return url
    try:
        r = requests.get(url, allow_redirects=False)
        if not r.is_redirect:
            return url
        else:
            # break a redirection loop if it occurs
            redir = r.headers.get('Location')
            if True not in [
                    redir.startswith(p)
                    for p in ['http://', 'https://', 'ftp://']
            ]:
                return url
            if redir in seen:
                return url
            # some servers redirect to Unicode URLs, which are not legal
            try:
                unicode(redir)
            except UnicodeDecodeError:
                return url
            # there might be several levels of redirection
            return dereference(redir, seen, level + 1)
    except (requests.exceptions.RequestException, ValueError, socket.error):
        return url
    except:
        util.print_stack()
        return url
Example #17
  def process_request(self):
    try:
      if self.path in ['', '/']:
        self.browser_output(301, None, 'This document has moved.',
                            ['Location: /view'])
        return
      path, query_string = urlparse.urlparse(self.path)[2:5:2]
      vars = []
      if query_string:
        # parse_qsl does not comply with RFC 3986, we have to decode UTF-8
        query_list = [(n, v.decode('UTF-8'))
                      for n, v in urlparse.parse_qsl(query_string, 1)]
        self.input.update(dict(query_list))

      if param.debug:
        logging.info((self.command, self.path, self.request_version, vars))

      if path.endswith('.gif') and path[1:] in self.images:
        self.browser_output(200, 'image/gif', self.images[path[1:]],
                            http_headers=no_expire)
        return

      if path.endswith('.js') and path[1:] in self.rsrc:
        self.browser_output(200, 'text/javascript', self.rsrc[path[1:]],
                            http_headers=no_expire)
        return

      if path.startswith('/tiny_mce'):
        # guard against attempts to subvert security using ../
        path = os.path.normpath('.' + path)
        assert path.startswith('tiny_mce')
        self.set_mime_type(path)
        self.browser_output(200, self.mime_type, open(path).read(),
                            http_headers=no_expire)
        return

      if path.count('favicon.ico') > 0:
        self.favicon()

      if path.endswith('.css'):
        path = path.replace('.css', '_css')
        tmpl = path.split('/', 1)[1].strip('/')
        self.use_template(tmpl, [self.input])

      if not self.require_auth(param.auth_dict):
        return
      
      if path.startswith('/redirect/'):
        from singleton import db
        c = db.cursor()
        item_uid = int(path[10:])
        c.execute('select item_link from fm_items where item_uid=%d'
                  % item_uid)
        redirect_url = c.fetchone()[0]
        c.close()
        self.browser_output(301, None, 'This document has moved.',
                            ['Location: ' + redirect_url])
        return

      if path.startswith('/threads'):
        frames = sys._current_frames()
        row = 0
        out = []
        if singleton.c_opened:
          out.append('<h1>Open Cursors</h1>\n')
          for curs, tb in singleton.c_opened.iteritems():
            if curs not in singleton.c_closed:
              row += 1
              if row % 2:
                color = '#ddd'
              else:
                color = 'white'
              out.append('<div style="background-color: ' + color + '">\n<pre>')
              out.append(curs.replace('<', '&lt;').replace('>', '&gt;') + '\n')
              out.append('\n'.join(tb[:-2]))
              out.append('</pre></div>\n')
        out.append('<h1>Threads</h1>\n')
        row = 0
        for thread_id, frame in sorted(frames.iteritems()):
          if thread_id == threading.currentThread()._Thread__ident:
            continue
          row += 1
          if row % 2:
            color = '#ddd'
          else:
            color = 'white'
          out.append('<div style="background-color: ' + color + '">\n<pre>')
          out.append('Thread %s (%d refs)\n'
                     % (thread_id, sys.getrefcount(frame)))
          out.append(''.join(traceback.format_stack(frame)).replace(
            '&', '&amp;').replace('<', '&lt;').replace('>', '&gt;'))
          out.append('\n<hr>\n')
          out.append(pprint.pformat(frame.f_locals).replace(
            '&', '&amp;').replace('<', '&lt;').replace('>', '&gt;'))
          out.append('\n</pre>\n</div>\n')
        del frames
        self.browser_output(200, 'text/html', ''.join(out))
        return

      if path.startswith('/xmlfeedback/'):
        op, item_uid = path.split('/')[2::2]
        item_uid = item_uid.split('.')[0]
        # for safety, these operations should be idempotent
        if op in ['promote', 'demote', 'basic', 'yappi']:
          if op != 'yappi':
            item_uid = int(item_uid)
          getattr(self, 'op_' + op)(item_uid)
        self.xml()
        return

      if path.startswith('/stem'):
        txt = self.input['q']
        stem = ' '.join(normalize.stem(normalize.get_words(txt)))
        self.browser_output(200, 'text/plain', stem)
        return

      if path.startswith('/add_kw_rule'):
        from singleton import db
        c = db.cursor()
        try:
          filters.add_kw_rule(db, c, **self.input)
        except:
          util.print_stack()
        db.commit()
        c.close()
        self.xml()
        return

      if path.startswith('/del_kw_rule'):
        from singleton import db
        c = db.cursor()
        try:
          filters.del_kw_rule(db, c, **self.input)
        except:
          util.print_stack()
        db.commit()
        c.close()
        self.xml()
        return

      if path.startswith('/stats'):
        from singleton import db
        c = db.cursor()
        c.execute("""select date(item_loaded) as date, count(*) as articles,
        sum(case when item_rating=1 then 1 else 0 end) as interesting,
        sum(case when item_rating=0 then 1 else 0 end) as unread,
        sum(case when item_rating=-1 then 1 else 0 end) as filtered
        from fm_items
        where item_loaded > julianday('now') - 30
        group by 1 order by 1""")
        csvfile = cStringIO.StringIO()
        out = csv.writer(csvfile, dialect='excel', delimiter=',')
        out.writerow([col[0].capitalize() for col in c.description])
        for row in c:
          out.writerow(row)
        self.browser_output(200, 'text/csv', csvfile.getvalue())
        csvfile.close()
        c.close()
        return

      if path.endswith('.css'):
        path = path.replace('.css', '_css')

      tmpl = path.split('/', 1)[1].strip('/')
      self.use_template(tmpl, [self.input])
    except TembozTemplate.Redirect, e:
      redirect_url = e.args[0]
      self.browser_output(301, None, 'This document has moved.',
                          ['Location: ' + redirect_url])
      return
Example #18
def add_feed(feed_xml):
  """Try to add a feed. Returns a tuple (feed_uid, num_added, num_filtered)"""
  with dbop.db() as db:
    c = db.cursor()
    feed_xml = feed_xml.replace('feed://', 'http://')
    # verify the feed
    r = requests.get(feed_xml)
    f = feedparser.parse(r.content)
    if 'url' not in f:
      f['url'] = feed_xml
    # CVS versions of feedparser are not throwing exceptions as they should
    # see:
    # http://sourceforge.net/tracker/index.php?func=detail&aid=1379172&group_id=112328&atid=661937
    if not f.feed or ('link' not in f.feed or 'title' not in f.feed):
      # some feeds have multiple links, one for self and one for PuSH
      if f.feed and 'link' not in f.feed and 'links' in f.feed:
        try:
          for l in f.feed['links']:
            if l['rel'] == 'self':
              f.feed['link'] = l['href']
        except KeyError:
          pass
    if not f.feed or ('link' not in f.feed or 'title' not in f.feed):
      # try autodiscovery
      try:
        feed_xml = AutoDiscoveryHandler().feed_url(feed_xml)
      except HTMLParser.HTMLParseError:
        # in desperate conditions, regexps ride to the rescue
        try:
          feed_xml = re_autodiscovery(feed_xml)[0][1]
        except:
          util.print_stack()
          raise AutodiscoveryParseError
      if not feed_xml:
        raise ParseError
      r = requests.get(feed_xml)
      f = feedparser.parse(r.content)
      if not f.feed:
        raise ParseError
    # we have a valid feed, normalize it
    normalize.normalize_feed(f)
    feed = {
      'xmlUrl': f['url'],
      'htmlUrl': str(f.feed['link']),
      'etag': r.headers.get('Etag'),
      'title': f.feed['title'].encode('ascii', 'xmlcharrefreplace'),
      'desc': f.feed['description'].encode('ascii', 'xmlcharrefreplace')
      }
    for key, value in feed.items():
      if type(value) == str:
        feed[key] = value
    filters.load_rules(c)
    try:
      c.execute("""insert into fm_feeds
      (feed_xml, feed_etag, feed_html, feed_title, feed_desc) values
      (:xmlUrl, :etag, :htmlUrl, :title, :desc)""", feed)
      feed_uid = c.lastrowid
      num_added, num_filtered = process_parsed_feed(db, c, f, feed_uid)
      db.commit()
      return feed_uid, feed['title'], num_added, num_filtered
    except sqlite3.IntegrityError, e:
      if 'feed_xml' in str(e):
        db.rollback()
        raise FeedAlreadyExists
      else:
        db.rollback()
        raise UnknownError(str(e))
Example #19
def process_parsed_feed(db, c, f, feed_uid, feed_dupcheck=None, exempt=None):
  """Insert the entries from a feedparser parsed feed f in the database using
the cursor c for feed feed_uid.
Returns a tuple (number of items added unread, number of filtered items)"""
  num_added = 0
  num_filtered = 0
  filters.load_rules(c)
  # check if duplicate title checking is in effect
  if feed_dupcheck is None:
    c.execute("select feed_dupcheck from fm_feeds where feed_uid=?",
              [feed_uid])
    feed_dupcheck = bool(c.fetchone()[0])
  # check if the feed is exempt from filtering
  if exempt is None:
    c.execute("select feed_exempt from fm_feeds where feed_uid=?", [feed_uid])
    exempt = bool(c.fetchone()[0])
  # the Radio convention is reverse chronological order
  f['items'].reverse()
  for item in f['items']:
    try:
      normalize.normalize(item, f)
    except:
      util.print_stack()
      continue
    # evaluate the FilteringRules
    skip, rule = filters.evaluate_rules(item, f, feed_uid, exempt)
    filtered_by = None
    if skip:
      skip = -2
      if type(rule.uid) == int:
        filtered_by = rule.uid
      else:
        # XXX clunky convention for feed_rule, but that should disappear
        # XXX eventually
        filtered_by = 0
    title   = item['title']
    link    = item['link']
    guid    = item['id']
    author = item['author']
    created = item['created']
    modified = item['modified']
    if not modified:
      modified = None
    content = item['content']
    # check if the item already exists, using the GUID as key
    # but cache all seen GUIDs in a dictionary first, since most articles are
    # existing ones and we can save a database query this way
    if feed_uid in feed_guid_cache and guid in feed_guid_cache[feed_uid]:
      # existing entry and we've seen it before in this process instance
      # update the time stamp to prevent premature garbage-collection
      # in prune_feed_guid_cache
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
      continue
    else:
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
    # not seen yet, it may or may not be a duplicate, we have to find out the
    # hard way
    c.execute("""select item_uid, item_link,
    item_loaded, item_created, item_modified,
    item_md5hex, item_title, item_content, item_creator
    from fm_items where item_feed_uid=? and item_guid=?""",
              [feed_uid, guid])
    l = c.fetchall()
    # unknown GUID, but title/link duplicate checking may be in effect
    if not l:
      if feed_dupcheck:
        c.execute("""select count(*) from fm_items
        where item_feed_uid=? and (item_title=? or item_link=?)""",
                  [feed_uid, title, link])
        l = bool(c.fetchone()[0])
        if l:
          print >> param.activity, 'DUPLICATE TITLE', title
      # XXX Runt items (see normalize.py) are almost always spurious, we just
      # XXX skip them, although we may revisit this decision in the future
      if not l and item.get('RUNT', False):
        print >> param.activity, 'RUNT ITEM', item
        l = True
    # GUID already exists, this is a change
    else:
      assert len(l) == 1
      (item_uid, item_link, item_loaded, item_created, item_modified,
       item_md5hex, item_title, item_content, item_creator) = l[0]
      # if this is a feed without timestamps, use our timestamp to determine
      # the oldest item in the feed XML file
      if 'oldest' in f and f['oldest'] == '1970-01-01 00:00:00':
        if 'oldest_ts' not in f:
          f['oldest_ts'] = item_created
        else:
          f['oldest_ts'] = min(f['oldest_ts'], item_created)
      # XXX update item here
      # XXX update tags if required
    # GUID doesn't exist yet, insert it
    if not l:
      # finally, dereference the URL to get rid of annoying tracking servers
      # like feedburner, but only do this once to avoid wasting bandwidth
      link = normalize.dereference(link)
      try:
        c.execute("""insert into fm_items (item_feed_uid, item_guid,
        item_created,   item_modified, item_link, item_md5hex,
        item_title, item_content, item_creator, item_rating, item_rule_uid)
        values
        (?, ?, julianday(?), julianday(?), ?, ?, ?, ?, ?, ?, ?)""",
                  [feed_uid, guid, created, modified, link,
                   hashlib.md5(content).hexdigest(),
                   title, content, author, skip, filtered_by])
        # if we have tags, insert them
        # note: feedparser.py handles 'category' as a special case, so we
        # need to work around that to get to the data
        if item['item_tags']:
          c.execute("""select item_uid
          from fm_items where item_feed_uid=? and item_guid=?""",
                    [feed_uid, guid])
          item_uid = c.fetchone()[0]
          for tag in item['item_tags']:
            c.execute("""insert or ignore into fm_tags (tag_name, tag_item_uid)
            values (?, ?)""", [tag, item_uid])
        if skip:
          num_filtered += 1
          print >> param.activity, 'SKIP', title, rule
        else:
          num_added += 1
          print >> param.activity, ' ' * 4, title
      except:
        util.print_stack(['c', 'f'])
        continue
  # update timestamp of the oldest item still in the feed file
  if 'oldest' in f and f['oldest'] != '9999-99-99 99:99:99':
    if f['oldest'] == '1970-01-01 00:00:00' and 'oldest_ts' in f:
      c.execute("update fm_feeds set feed_oldest=? where feed_uid=?",
                [f['oldest_ts'], feed_uid])
    else:
      c.execute("""update fm_feeds set feed_oldest=julianday(?)
      where feed_uid=?""", [f['oldest'], feed_uid])
  
  return (num_added, num_filtered)
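
feed_guid_cache stores a last-seen timestamp per (feed_uid, guid) precisely so that prune_feed_guid_cache can expire stale entries later. That function is not included in this listing; the sketch below only illustrates the kind of time-based pruning the comments imply, with an assumed one-week cutoff:

import time

def prune_feed_guid_cache(max_age=7 * 24 * 3600):
  # hypothetical sketch: drop GUIDs whose timestamp has not been refreshed
  # within max_age seconds; the original code may use a different cutoff
  now = time.time()
  for feed_uid, guids in feed_guid_cache.items():
    for guid, seen in guids.items():
      if now - seen > max_age:
        del guids[guid]
    if not guids:
      del feed_guid_cache[feed_uid]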
Example #20
def normalize(item, f, run_filters=True):
  # get rid of RDF lossage...
  for key in ['title', 'link', 'created', 'modified', 'author',
              'content', 'content_encoded', 'description']:
    if type(item.get(key)) == list:
      if len(item[key]) == 1:
        item[key] = item[key][0]
      else:
        candidate = [i for i in item[key] if i.get('type') == 'text/html']
        if len(candidate) > 1 and key == 'content':
          candidate = sorted(candidate,
                             key=lambda i: len(i.get('value', '')),
                             reverse=True)[:1]
        if len(candidate) == 1:
          item[key] = candidate[0]
        else:
          # XXX not really sure how to handle these cases
          print >> param.log, 'E' * 16, 'ambiguous RDF', key, item[key]
          item[key] = item[key][0]
    if isinstance(item.get(key), dict) and 'value' in item[key]:
      item[key] = item[key]['value']
  ########################################################################
  # title
  if 'title' not in item or not item['title'].strip():
    item['title'] = 'Untitled'
  # XXX for debugging
  if type(item['title']) not in [str, unicode]:
    print >> param.log, 'TITLE' * 15
    import code
    from sys import exit
    code.interact(local=locals())
  item['title_lc'] =   lower(item['title'])
  item['title_words_exact'] =  get_words(item['title_lc'])
  item['title_words'] =  stem(item['title_words_exact'])
  ########################################################################
  # link
  #
  # The RSS 2.0 specification allows items not to have a link if the entry
  # is complete in itself
  # that said this is almost always spurious, so we filter it below
  if 'link' not in item:
    item['link'] = f['channel']['link']
    # We have to be careful not to assign a default URL as the GUID
    # otherwise only one item will ever be recorded
    if 'id' not in item:
      item['id'] = 'HASH_CONTENT'
      item['RUNT'] = True
  if type(item['link']) == unicode:
    item['link'] = str(item['link'].encode('UTF-8'))
  if type(item['link']) != str:
    print >> param.log, 'LINK IS NOT str', repr(item['link'])
  # XXX special case handling for annoying Sun/Roller malformed entries
  if 'blog.sun.com' in item['link'] or 'blogs.sun.com' in item['link']:
    item['link'] = item['link'].replace(
      'blog.sun.com', 'blogs.sun.com').replace(
      'blogs.sun.com/page', 'blogs.sun.com/roller/page')
  ########################################################################
  # GUID
  if 'id' not in item:
    item['id'] = item['link']
  ########################################################################
  # creator
  if 'author' not in item or item['author'] == 'Unknown':
    item['author'] = 'Unknown'
    if 'author' in f['channel']:
      item['author'] = f['channel']['author']
  ########################################################################
  # created and modified dates
  if 'modified' not in item:
    item['modified'] = f['channel'].get('modified')
  # created - use modified if not available
  if 'created' not in item:
    if 'modified_parsed' in item:
      created = item['modified_parsed']
    else:
      created = None
  else:
    created = item['created_parsed']
  if not created:
    # XXX use HTTP last-modified date here
    created = time.gmtime()
    # feeds that do not have timestamps cannot be garbage-collected
    # XXX need to find a better heuristic, as high-volume sites such as
    # XXX The Guardian, CNET.com or Salon.com lack item-level timestamps
    f['oldest'] = '1970-01-01 00:00:00'
  created = fix_date(created)
  item['created'] = time.strftime(date_fmt, created)
  # keep track of the oldest item still in the feed file
  if 'oldest' not in f:
    f['oldest'] = '9999-99-99 99:99:99'
  if item['created'] < f['oldest']:
    f['oldest'] = item['created']
  # finish modified date
  if 'modified_parsed' in item and item['modified_parsed']:
    modified = fix_date(item['modified_parsed'])
    # add a fudge factor time window within which modifications are not
    # counted as such, 10 minutes here
    if not modified or abs(time.mktime(modified) - time.mktime(created)) < 600:
      item['modified'] = None
    else:
      item['modified'] = time.strftime(date_fmt, modified)
  else:
    item['modified'] = None
  ########################################################################
  # content
  if 'content' in item:
    content = item['content']
  elif 'content_encoded' in item:
    content = item['content_encoded']
  elif 'description' in item:
    content = item['description']
  else:
    content = '<a href="' + item['link'] + '">' + item['title'] + '</a>'
  if not content:
    content = '<a href="' + item['link'] + '">' + item['title'] + '</a>'
  # strip embedded NULs as a defensive measure
  content = content.replace('\0', '')
  # apply ad filters and other degunking to content
  old_content = None
  while old_content != content:
    old_content = content
    try:
      for filter in transform.filter_list:
        content = filter.apply(content, f, item)
    except:
      util.print_stack(black_list=['item'])
  # balance tags like <b>...</b>
  content = balance(content)
  content_lc = lower(content)
  # the content might have invalid 8-bit characters.
  # Heuristic suggested by Georg Bauer
  if type(content) != unicode:
    try:
      content = content.decode('utf-8')
    except UnicodeError:
      content = content.decode('iso-8859-1')
  #
  item['content'] = content
  # we recalculate this as content may have changed due to tag rebalancing, etc
  item['content_lc'] = lower(content)
  item['content_words_exact'] = get_words(item['content_lc'])
  item['content_words'] = stem(item['content_words_exact'])
  item['union_lc'] = item['title_lc'] + '\n' + item['content_lc']
  item['union_words'] = item['title_words'].union(item['content_words'])
  item['urls'] = url_re.findall(content)
  ########################################################################
  # categories/tags
  # we used 'category' before, but 'category' and 'categories' are
  # intercepted by feedparser.FeedParserDict.__getitem__ and treated as
  # a special case
  if 'tags' in item and type(item['tags']) == list:
    item['item_tags'] = set([lower(t['term']) for t in item['tags']])
  else:
    item['item_tags'] = []
  ########################################################################
  # map unicode
  for key in ['title', 'link', 'created', 'modified', 'author', 'content']:
    if type(item.get(key)) == unicode:
      item[key] = item[key].encode('ascii', 'xmlcharrefreplace')
  # hash the content as the GUID if required
  if item['id'] == 'HASH_CONTENT':
    item['id']= hashlib.md5(item['title'] + item['content']).hexdigest()
Example #21
def normalize(item, f, run_filters=True):
    # get rid of RDF lossage...
    for key in [
            'title', 'link', 'created', 'modified', 'author', 'content',
            'content_encoded', 'description'
    ]:
        if type(item.get(key)) == list:
            if len(item[key]) == 1:
                item[key] = item[key][0]
            else:
                candidate = [
                    i for i in item[key] if i.get('type') == 'text/html'
                ]
                if len(candidate) > 1 and key == 'content':
                    candidate = sorted(candidate,
                                       key=lambda i: len(i.get('value', '')),
                                       reverse=True)[:1]
                if len(candidate) == 1:
                    item[key] = candidate[0]
                else:
                    # XXX not really sure how to handle these cases
                    print >> param.log, 'E' * 16, 'ambiguous RDF', key, item[
                        key]
                    item[key] = item[key][0]
        if isinstance(item.get(key), dict) and 'value' in item[key]:
            item[key] = item[key]['value']
    ########################################################################
    # title
    if 'title' not in item or not item['title'].strip():
        item['title'] = 'Untitled'
    # XXX for debugging
    if type(item['title']) not in [str, unicode]:
        print >> param.log, 'TITLE' * 15
        import code
        from sys import exit
        code.interact(local=locals())
    item['title_lc'] = lower(item['title'])
    item['title_words_exact'] = get_words(item['title_lc'])
    item['title_words'] = stem(item['title_words_exact'])
    ########################################################################
    # link
    #
    # The RSS 2.0 specification allows items not to have a link if the entry
    # is complete in itself
    # that said this is almost always spurious, so we filter it below
    if 'link' not in item:
        item['link'] = f['channel']['link']
        # We have to be careful not to assign a default URL as the GUID
        # otherwise only one item will ever be recorded
        if 'id' not in item:
            item['id'] = 'HASH_CONTENT'
            item['RUNT'] = True
    if type(item['link']) == unicode:
        item['link'] = str(item['link'].encode('UTF-8'))
    if type(item['link']) != str:
        print >> param.log, 'LINK IS NOT str', repr(item['link'])
    # XXX special case handling for annoying Sun/Roller malformed entries
    if 'blog.sun.com' in item['link'] or 'blogs.sun.com' in item['link']:
        item['link'] = item['link'].replace('blog.sun.com',
                                            'blogs.sun.com').replace(
                                                'blogs.sun.com/page',
                                                'blogs.sun.com/roller/page')
    ########################################################################
    # GUID
    if 'id' not in item:
        item['id'] = item['link']
    ########################################################################
    # creator
    if 'author' not in item or item['author'] == 'Unknown':
        item['author'] = 'Unknown'
        if 'author' in f['channel']:
            item['author'] = f['channel']['author']
    ########################################################################
    # created and modified dates
    if 'modified' not in item:
        item['modified'] = f['channel'].get('modified')
    # created - use modified if not available
    if 'created' not in item:
        if 'modified_parsed' in item:
            created = item['modified_parsed']
        else:
            created = None
    else:
        created = item['created_parsed']
    if not created:
        # XXX use HTTP last-modified date here
        created = time.gmtime()
        # feeds that do not have timestamps cannot be garbage-collected
        # XXX need to find a better heuristic, as high-volume sites such as
        # XXX The Guardian, CNET.com or Salon.com lack item-level timestamps
        f['oldest'] = '1970-01-01 00:00:00'
    created = fix_date(created)
    item['created'] = time.strftime(date_fmt, created)
    # keep track of the oldest item still in the feed file
    if 'oldest' not in f:
        f['oldest'] = '9999-99-99 99:99:99'
    if item['created'] < f['oldest']:
        f['oldest'] = item['created']
    # finish modified date
    if 'modified_parsed' in item and item['modified_parsed']:
        modified = fix_date(item['modified_parsed'])
        # add a fudge factor time window within which modifications are not
        # counted as such, 10 minutes here
        if not modified or abs(time.mktime(modified) -
                               time.mktime(created)) < 600:
            item['modified'] = None
        else:
            item['modified'] = time.strftime(date_fmt, modified)
    else:
        item['modified'] = None
    ########################################################################
    # content
    if 'content' in item:
        content = item['content']
    elif 'content_encoded' in item:
        content = item['content_encoded']
    elif 'description' in item:
        content = item['description']
    else:
        content = '<a href="' + item['link'] + '">' + item['title'] + '</a>'
    if not content:
        content = '<a href="' + item['link'] + '">' + item['title'] + '</a>'
    # strip embedded NULs as a defensive measure
    content = content.replace('\0', '')
    # apply ad filters and other degunking to content
    old_content = None
    while old_content != content:
        old_content = content
        try:
            for filter in transform.filter_list:
                content = filter.apply(content, f, item)
        except:
            util.print_stack(black_list=['item'])
    # balance tags like <b>...</b>
    content = balance(content)
    content_lc = lower(content)
    # the content might have invalid 8-bit characters.
    # Heuristic suggested by Georg Bauer
    if type(content) != unicode:
        try:
            content = content.decode('utf-8')
        except UnicodeError:
            content = content.decode('iso-8859-1')
    #
    item['content'] = content
    # we recalculate this as content may have changed due to tag rebalancing, etc
    item['content_lc'] = lower(content)
    item['content_words_exact'] = get_words(item['content_lc'])
    item['content_words'] = stem(item['content_words_exact'])
    item['union_lc'] = item['title_lc'] + '\n' + item['content_lc']
    item['union_words'] = item['title_words'].union(item['content_words'])
    item['urls'] = url_re.findall(content)
    ########################################################################
    # categories/tags
    # we used 'category' before, but 'category' and 'categories' are
    # intercepted by feedparser.FeedParserDict.__getitem__ and treated as
    # a special case
    if 'tags' in item and type(item['tags']) == list:
        item['item_tags'] = set([lower(t['term']) for t in item['tags']])
    else:
        item['item_tags'] = []
    ########################################################################
    # map unicode
    for key in ['title', 'link', 'created', 'modified', 'author', 'content']:
        if type(item.get(key)) == unicode:
            item[key] = item[key].encode('ascii', 'xmlcharrefreplace')
    # hash the content as the GUID if required
    if item['id'] == 'HASH_CONTENT':
        item['id'] = hashlib.md5(item['title'] + item['content']).hexdigest()
Example #22
def process_parsed_feed(db, c, f, feed_uid, feed_dupcheck=None, exempt=None):
  """Insert the entries from a feedparser parsed feed f in the database using
the cursor c for feed feed_uid.
Returns a tuple (number of items added unread, number of filtered items)"""
  num_added = 0
  num_filtered = 0
  filters.load_rules(db, c)
  # check if duplicate title checking is in effect
  if feed_dupcheck is None:
    c.execute("select feed_dupcheck from fm_feeds where feed_uid=?",
              [feed_uid])
    feed_dupcheck = bool(c.fetchone()[0])
  # check if the feed is exempt from filtering
  if exempt is None:
    c.execute("select feed_exempt from fm_feeds where feed_uid=?", [feed_uid])
    exempt = bool(c.fetchone()[0])
  # the Radio convention is reverse chronological order
  f['items'].reverse()
  for item in f['items']:
    try:
      normalize.normalize(item, f)
    except:
      util.print_stack()
      continue
    # evaluate the FilteringRules
    skip, rule = filters.evaluate_rules(item, f, feed_uid, exempt)
    filtered_by = None
    if skip:
      skip = -2
      if type(rule.uid) == int:
        filtered_by = rule.uid
      else:
        # XXX clunky convention for feed_rule, but that should disappear
        # XXX eventually
        filtered_by = 0
    title   = item['title']
    link    = item['link']
    guid    = item['id']
    author = item['author']
    created = item['created']
    modified = item['modified']
    if not modified:
      modified = None
    content = item['content']
    # check if the item already exists, using the GUID as key
    # but cache all seen GUIDs in a dictionary first, since most articles are
    # existing ones and we can save a database query this way
    if feed_uid in feed_guid_cache and guid in feed_guid_cache[feed_uid]:
      # existing entry and we've seen it before in this process instance
      # update the time stamp to prevent premature garbage-collection
      # in prune_feed_guid_cache
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
      continue
    else:
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
    # not seen yet, it may or may not be a duplicate, we have to find out the
    # hard way
    c.execute("""select item_uid, item_link,
    item_loaded, item_created, item_modified,
    item_md5hex, item_title, item_content, item_creator
    from fm_items where item_feed_uid=? and item_guid=?""",
              [feed_uid, guid])
    l = c.fetchall()
    # unknown GUID, but title/link duplicate checking may be in effect
    if not l:
      if feed_dupcheck:
        c.execute("""select count(*) from fm_items
        where item_feed_uid=? and (item_title=? or item_link=?)""",
                  [feed_uid, title, link])
        l = bool(c.fetchone()[0])
        if l:
          print >> param.activity, 'DUPLICATE TITLE', title
      # XXX Runt items (see normalize.py) are almost always spurious, we just
      # XXX skip them, although we may revisit this decision in the future
      if not l and item.get('RUNT', False):
        print >> param.activity, 'RUNT ITEM', item
        l = True
    # GUID already exists, this is a change
    else:
      assert len(l) == 1
      (item_uid, item_link, item_loaded, item_created, item_modified,
       item_md5hex, item_title, item_content, item_creator) = l[0]
      # if this is a feed without timestamps, use our timestamp to determine
      # the oldest item in the feed XML file
      if 'oldest' in f and f['oldest'] == '1970-01-01 00:00:00':
        if 'oldest_ts' not in f:
          f['oldest_ts'] = item_created
        else:
          f['oldest_ts'] = min(f['oldest_ts'], item_created)
      # XXX update item here
      # XXX update tags if required
    # GUID doesn't exist yet, insert it
    if not l:
      # finally, dereference the URL to get rid of annoying tracking servers
      # like feedburner, but only do this once to avoid wasting bandwidth
      link = normalize.dereference(link)
      try:
        c.execute("""insert into fm_items (item_feed_uid, item_guid,
        item_created,   item_modified, item_link, item_md5hex,
        item_title, item_content, item_creator, item_rating, item_rule_uid)
        values
        (?, ?, julianday(?), julianday(?), ?, ?, ?, ?, ?, ?, ?)""",
                  [feed_uid, guid, created, modified, link,
                   hashlib.md5(content).hexdigest(),
                   title, content, author, skip, filtered_by])
        # if we have tags, insert them
        # note: feedparser.py handles 'category' as a special case, so we
        # need to work around that to get to the data
        if item['item_tags']:
          c.execute("""select item_uid
          from fm_items where item_feed_uid=? and item_guid=?""",
                    [feed_uid, guid])
          item_uid = c.fetchone()[0]
          for tag in item['item_tags']:
            c.execute("""insert or ignore into fm_tags (tag_name, tag_item_uid)
            values (?, ?)""", [tag, item_uid])
        if skip:
          num_filtered += 1
          print >> param.activity, 'SKIP', title, rule
        else:
          num_added += 1
          print >> param.activity, ' ' * 4, title
      except:
        util.print_stack(['c', 'f'])
        continue
  # update timestamp of the oldest item still in the feed file
  if 'oldest' in f and f['oldest'] != '9999-99-99 99:99:99':
    if f['oldest'] == '1970-01-01 00:00:00' and 'oldest_ts' in f:
      c.execute("update fm_feeds set feed_oldest=? where feed_uid=?",
                [f['oldest_ts'], feed_uid])
    else:
      c.execute("""update fm_feeds set feed_oldest=julianday(?)
      where feed_uid=?""", [f['oldest'], feed_uid])
  
  return (num_added, num_filtered)
Example #23
    return url
  except (urllib2.URLError, ValueError, socket.error):
    return url
  except Redirect, e:
    # break a redirection loop if it occurs
    if e.url in seen:
      return url
    # some servers redirect to Unicode URLs, which are not legal
    try:
      unicode(e.url)
    except UnicodeDecodeError:
      return url
    # there might be several levels of redirection
    return dereference(e.url, seen, level + 1)
  except:
    util.print_stack()
    return url
  
url_re = re.compile('(?:href|src)="([^"]*)"', re.IGNORECASE)

def normalize(item, f, run_filters=True):
  # get rid of RDF lossage...
  for key in ['title', 'link', 'created', 'modified', 'author',
              'content', 'content_encoded', 'description']:
    if type(item.get(key)) == list:
      if len(item[key]) == 1:
        item[key] = item[key][0]
      else:
        candidate = [i for i in item[key] if i.get('type') == 'text/html']
        if len(candidate) > 1 and key == 'content':
          candidate = sorted(candidate,
Example #24
def add_feed(feed_xml):
  """Try to add a feed. Returns a tuple (feed_uid, num_added, num_filtered)"""
  from singleton import db
  c = db.cursor()
  feed_xml = feed_xml.replace('feed://', 'http://')
  try:
    # verify the feed
    f = feedparser.parse(feed_xml)
    # CVS versions of feedparser are not throwing exceptions as they should
    # see:
    # http://sourceforge.net/tracker/index.php?func=detail&aid=1379172&group_id=112328&atid=661937
    if not f.feed or ('link' not in f.feed or 'title' not in f.feed):
      # some feeds have multiple links, one for self and one for PuSH
      if f.feed and 'link' not in f.feed and 'links' in f.feed:
        try:
          for l in f.feed['links']:
            if l['rel'] == 'self':
              f.feed['link'] = l['href']
        except KeyError:
          pass
    if not f.feed or ('link' not in f.feed or 'title' not in f.feed):
      # try autodiscovery
      try:
        feed_xml = AutoDiscoveryHandler().feed_url(feed_xml)
      except HTMLParser.HTMLParseError:
        # in desperate conditions, regexps ride to the rescue
        try:
          feed_xml = re_autodiscovery(feed_xml)[0][1]
        except:
          util.print_stack()
          raise AutodiscoveryParseError
      if not feed_xml:
        raise ParseError
      f = feedparser.parse(feed_xml)
      if not f.feed:
        raise ParseError
    # we have a valid feed, normalize it
    normalize.normalize_feed(f)
    feed = {
      'xmlUrl': f['url'],
      'htmlUrl': str(f.feed['link']),
      'etag': f.get('etag'),
      'title': f.feed['title'].encode('ascii', 'xmlcharrefreplace'),
      'desc': f.feed['description'].encode('ascii', 'xmlcharrefreplace')
      }
    for key, value in feed.items():
      if type(value) == str:
        feed[key] = value
    filters.load_rules(db, c)
    try:
      c.execute("""insert into fm_feeds
      (feed_xml, feed_etag, feed_html, feed_title, feed_desc) values
      (:xmlUrl, :etag, :htmlUrl, :title, :desc)""", feed)
      feed_uid = c.lastrowid
      num_added, num_filtered = process_parsed_feed(db, c, f, feed_uid)
      db.commit()
      return feed_uid, feed['title'], num_added, num_filtered
    except sqlite.IntegrityError, e:
      if 'feed_xml' in str(e):
        db.rollback()
        raise FeedAlreadyExists
      else:
        db.rollback()
        raise UnknownError(str(e))
  finally:
    c.close()