def run(self):
  """Worker loop: consume (uid, url) pairs from in_q, dereference each URL,
  and report any URL that changed on out_q.

  A pair whose uid is None is the shutdown sentinel: it is propagated
  downstream as (None, None, None) and the loop exits.
  """
  while True:
    item_uid, original_url = in_q.get()
    if item_uid is None:
      # forward the sentinel so the consumer also knows to stop
      out_q.put((None, None, None))
      return
    resolved_url = normalize.dereference(original_url)
    if resolved_url != original_url:
      # only dereferenced URLs that actually changed are worth reporting
      out_q.put((item_uid, original_url, resolved_url))
def process_parsed_feed(db, c, f, feed_uid, feed_dupcheck=None, exempt=None):
  """Insert the entries from a feedparser parsed feed f in the database using
  the cursor c for feed feed_uid.

  Returns a tuple (number of items added unread, number of filtered items).

  Side effects: reverses f['items'] in place, may set f['oldest_ts'],
  refreshes the module-level feed_guid_cache, and writes to fm_items,
  fm_tags and fm_feeds via the cursor c (the caller owns the transaction).

  NOTE(review): this file defines process_parsed_feed twice; the later
  definition shadows this one at import time.
  """
  num_added = 0
  num_filtered = 0
  # fixed: pass db as well, matching the call signature used by the later
  # (live) duplicate definition of this function in this file
  filters.load_rules(db, c)
  # check if duplicate title checking is in effect
  if feed_dupcheck is None:
    c.execute("select feed_dupcheck from fm_feeds where feed_uid=?",
              [feed_uid])
    feed_dupcheck = bool(c.fetchone()[0])
  # check if the feed is exempt from filtering
  if exempt is None:
    c.execute("select feed_exempt from fm_feeds where feed_uid=?", [feed_uid])
    exempt = bool(c.fetchone()[0])
  # the Radio convention is reverse chronological order
  f['items'].reverse()
  for item in f['items']:
    try:
      normalize.normalize(item, f)
    except:
      # NOTE(review): bare except also swallows SystemExit/KeyboardInterrupt;
      # kept as-is since the original deliberately logs and skips the item
      util.print_stack()
      continue
    # evaluate the FilteringRules
    skip, rule = filters.evaluate_rules(item, f, feed_uid, exempt)
    filtered_by = None
    if skip:
      # -2 is stored in fm_items.item_rating for filtered entries (see the
      # insert statement below)
      skip = -2
      if type(rule.uid) == int:
        filtered_by = rule.uid
      else:
        # XXX clunky convention for feed_rule, but that should disappear
        # XXX eventually
        filtered_by = 0
    title = item['title']
    link = item['link']
    guid = item['id']
    author = item['author']
    created = item['created']
    modified = item['modified']
    if not modified:
      # store NULL rather than an empty/falsy timestamp
      modified = None
    content = item['content']
    # check if the item already exists, using the GUID as key
    # but cache all seen GUIDs in a dictionary first, since most articles are
    # existing ones and we can save a database query this way
    if feed_uid in feed_guid_cache and guid in feed_guid_cache[feed_uid]:
      # existing entry and we've seen it before in this process instance
      # update the time stamp to prevent premature garbage-collection
      # in prune_feed_guid_cache
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
      continue
    else:
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
    # not seen yet, it may or may not be a duplicate, we have to find out the
    # hard way
    c.execute("""select item_uid, item_link, item_loaded, item_created,
    item_modified, item_md5hex, item_title, item_content, item_creator
    from fm_items where item_feed_uid=? and item_guid=?""", [feed_uid, guid])
    l = c.fetchall()
    # unknown GUID, but title/link duplicate checking may be in effect
    if not l:
      if feed_dupcheck:
        c.execute("""select count(*) from fm_items
        where item_feed_uid=? and (item_title=? or item_link=?)""",
                  [feed_uid, title, link])
        l = bool(c.fetchone()[0])
        if l:
          print >> param.activity, 'DUPLICATE TITLE', title
      # XXX Runt items (see normalize.py) are almost always spurious, we just
      # XXX skip them, although we may revisit this decision in the future
      if not l and item.get('RUNT', False):
        print >> param.activity, 'RUNT ITEM', item
        l = True
    # GUID already exists, this is a change
    else:
      assert len(l) == 1
      (item_uid, item_link, item_loaded, item_created, item_modified,
       item_md5hex, item_title, item_content, item_creator) = l[0]
      # if this is a feed without timestamps, use our timestamp to determine
      # the oldest item in the feed XML file
      if 'oldest' in f and f['oldest'] == '1970-01-01 00:00:00':
        if 'oldest_ts' not in f:
          f['oldest_ts'] = item_created
        else:
          f['oldest_ts'] = min(f['oldest_ts'], item_created)
      # XXX update item here
      # XXX update tags if required
    # GUID doesn't exist yet, insert it
    if not l:
      # finally, dereference the URL to get rid of annoying tracking servers
      # like feedburner, but only do this once to avoid wasting bandwidth
      link = normalize.dereference(link)
      try:
        c.execute("""insert into fm_items (item_feed_uid, item_guid,
        item_created, item_modified, item_link, item_md5hex, item_title,
        item_content, item_creator, item_rating, item_rule_uid)
        values (?, ?, julianday(?), julianday(?), ?, ?, ?, ?, ?, ?, ?)""",
                  [feed_uid, guid, created, modified, link,
                   hashlib.md5(content).hexdigest(), title, content, author,
                   skip, filtered_by])
        # if we have tags, insert them
        # note: feedparser.py handles 'category' as a special case, so we
        # need to work around that to get to the data
        if item['item_tags']:
          c.execute("""select item_uid from fm_items
          where item_feed_uid=? and item_guid=?""", [feed_uid, guid])
          item_uid = c.fetchone()[0]
          for tag in item['item_tags']:
            c.execute("""insert or ignore into fm_tags (tag_name, tag_item_uid)
            values (?, ?)""", [tag, item_uid])
        if skip:
          num_filtered += 1
          print >> param.activity, 'SKIP', title, rule
        else:
          num_added += 1
          print >> param.activity, ' ' * 4, title
      except:
        util.print_stack(['c', 'f'])
        continue
  # update timestamp of the oldest item still in the feed file
  if 'oldest' in f and f['oldest'] != '9999-99-99 99:99:99':
    if f['oldest'] == '1970-01-01 00:00:00' and 'oldest_ts' in f:
      c.execute("update fm_feeds set feed_oldest=? where feed_uid=?",
                [f['oldest_ts'], feed_uid])
    else:
      c.execute("""update fm_feeds set feed_oldest=julianday(?)
      where feed_uid=?""", [f['oldest'], feed_uid])
  return (num_added, num_filtered)
def process_parsed_feed(db, c, f, feed_uid, feed_dupcheck=None, exempt=None):
  """Insert the entries from a feedparser parsed feed f in the database using
  the cursor c for feed feed_uid.

  Returns a tuple (number of items added unread, number of filtered items).

  feed_dupcheck and exempt default to the per-feed flags stored in fm_feeds
  when not supplied by the caller.

  Side effects: reverses f['items'] in place, may set f['oldest_ts'],
  refreshes the module-level feed_guid_cache, and writes to fm_items,
  fm_tags and fm_feeds via the cursor c (the caller owns the transaction).
  """
  num_added = 0
  num_filtered = 0
  filters.load_rules(db, c)
  # check if duplicate title checking is in effect
  if feed_dupcheck is None:
    c.execute("select feed_dupcheck from fm_feeds where feed_uid=?",
              [feed_uid])
    feed_dupcheck = bool(c.fetchone()[0])
  # check if the feed is exempt from filtering
  if exempt is None:
    c.execute("select feed_exempt from fm_feeds where feed_uid=?", [feed_uid])
    exempt = bool(c.fetchone()[0])
  # the Radio convention is reverse chronological order
  f['items'].reverse()
  for item in f['items']:
    try:
      normalize.normalize(item, f)
    except:
      # best-effort: a malformed item is logged and skipped, never fatal
      util.print_stack()
      continue
    # evaluate the FilteringRules
    skip, rule = filters.evaluate_rules(item, f, feed_uid, exempt)
    filtered_by = None
    if skip:
      # -2 is stored in fm_items.item_rating for filtered entries (see the
      # insert statement below)
      skip = -2
      if type(rule.uid) == int:
        filtered_by = rule.uid
      else:
        # XXX clunky convention for feed_rule, but that should disappear
        # XXX eventually
        filtered_by = 0
    title = item['title']
    link = item['link']
    guid = item['id']
    author = item['author']
    created = item['created']
    modified = item['modified']
    if not modified:
      # store NULL rather than an empty/falsy timestamp
      modified = None
    content = item['content']
    # check if the item already exists, using the GUID as key
    # but cache all seen GUIDs in a dictionary first, since most articles are
    # existing ones and we can save a database query this way
    if feed_uid in feed_guid_cache and guid in feed_guid_cache[feed_uid]:
      # existing entry and we've seen it before in this process instance
      # update the time stamp to prevent premature garbage-collection
      # in prune_feed_guid_cache
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
      continue
    else:
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
    # not seen yet, it may or may not be a duplicate, we have to find out the
    # hard way
    c.execute("""select item_uid, item_link, item_loaded, item_created,
    item_modified, item_md5hex, item_title, item_content, item_creator
    from fm_items where item_feed_uid=? and item_guid=?""", [feed_uid, guid])
    l = c.fetchall()
    # unknown GUID, but title/link duplicate checking may be in effect
    if not l:
      if feed_dupcheck:
        c.execute("""select count(*) from fm_items
        where item_feed_uid=? and (item_title=? or item_link=?)""",
                  [feed_uid, title, link])
        # l doubles as "already present" flag from here on
        l = bool(c.fetchone()[0])
        if l:
          print >> param.activity, 'DUPLICATE TITLE', title
      # XXX Runt items (see normalize.py) are almost always spurious, we just
      # XXX skip them, although we may revisit this decision in the future
      if not l and item.get('RUNT', False):
        print >> param.activity, 'RUNT ITEM', item
        l = True
    # GUID already exists, this is a change
    else:
      assert len(l) == 1
      (item_uid, item_link, item_loaded, item_created, item_modified,
       item_md5hex, item_title, item_content, item_creator) = l[0]
      # if this is a feed without timestamps, use our timestamp to determine
      # the oldest item in the feed XML file
      if 'oldest' in f and f['oldest'] == '1970-01-01 00:00:00':
        if 'oldest_ts' not in f:
          f['oldest_ts'] = item_created
        else:
          f['oldest_ts'] = min(f['oldest_ts'], item_created)
      # XXX update item here
      # XXX update tags if required
    # GUID doesn't exist yet, insert it
    if not l:
      # finally, dereference the URL to get rid of annoying tracking servers
      # like feedburner, but only do this once to avoid wasting bandwidth
      link = normalize.dereference(link)
      try:
        c.execute("""insert into fm_items (item_feed_uid, item_guid,
        item_created, item_modified, item_link, item_md5hex, item_title,
        item_content, item_creator, item_rating, item_rule_uid)
        values (?, ?, julianday(?), julianday(?), ?, ?, ?, ?, ?, ?, ?)""",
                  [feed_uid, guid, created, modified, link,
                   hashlib.md5(content).hexdigest(), title, content, author,
                   skip, filtered_by])
        # if we have tags, insert them
        # note: feedparser.py handles 'category' as a special case, so we
        # need to work around that to get to the data
        if item['item_tags']:
          c.execute("""select item_uid from fm_items
          where item_feed_uid=? and item_guid=?""", [feed_uid, guid])
          item_uid = c.fetchone()[0]
          for tag in item['item_tags']:
            c.execute("""insert or ignore into fm_tags (tag_name, tag_item_uid)
            values (?, ?)""", [tag, item_uid])
        if skip:
          num_filtered += 1
          print >> param.activity, 'SKIP', title, rule
        else:
          num_added += 1
          print >> param.activity, ' ' * 4, title
      except:
        # best-effort: log the failed insert (with cursor/feed context) and
        # move on to the next item
        util.print_stack(['c', 'f'])
        continue
  # update timestamp of the oldest item still in the feed file
  # '9999-99-99 99:99:99' appears to be the "no items seen" sentinel — confirm
  if 'oldest' in f and f['oldest'] != '9999-99-99 99:99:99':
    if f['oldest'] == '1970-01-01 00:00:00' and 'oldest_ts' in f:
      # timestamp-less feed: use the timestamp accumulated in the loop above
      c.execute("update fm_feeds set feed_oldest=? where feed_uid=?",
                [f['oldest_ts'], feed_uid])
    else:
      c.execute("""update fm_feeds set feed_oldest=julianday(?)
      where feed_uid=?""", [f['oldest'], feed_uid])
  return (num_added, num_filtered)