async def livereload_js(request: web.Request) -> web.Response:
    if request.if_modified_since:
        raise HTTPNotModified()

    lr_script = request.app['livereload_script']
    return web.Response(
        body=lr_script,
        content_type='application/javascript',
        headers={LAST_MODIFIED: 'Fri, 01 Jan 2016 00:00:00 GMT'},
    )

async def livereload_js(request):
    if request.if_modified_since:
        aux_logger.debug('> %s %s %s 0B', request.method, request.path, 304)
        raise HTTPNotModified()

    script_key = 'livereload_script'
    lr_script = request.app.get(script_key)
    if lr_script is None:
        lr_path = Path(__file__).absolute().parent.joinpath('livereload.js')
        with lr_path.open('rb') as f:
            lr_script = f.read()
        request.app[script_key] = lr_script

    aux_logger.debug('> %s %s %s %s', request.method, request.path, 200,
                     fmt_size(len(lr_script)))
    return web.Response(
        body=lr_script,
        content_type='application/javascript',
        headers={LAST_MODIFIED: 'Fri, 01 Jan 2016 00:00:00 GMT'},
    )

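
# Minimal sketch of wiring the handler above into an aiohttp application.
# The '/livereload.js' route path and the port are illustrative choices,
# not necessarily what the real project uses.
from aiohttp import web


def make_app():
    app = web.Application()
    app.router.add_get('/livereload.js', livereload_js)
    return app


if __name__ == '__main__':
    web.run_app(make_app(), port=8000)
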
def send(self, request, filepath):
    """Send filepath to client using request."""
    gzip = False
    # use .get() so clients that omit Accept-Encoding don't raise KeyError
    if 'gzip' in request.headers.get(hdrs.ACCEPT_ENCODING, ''):
        gzip_path = filepath.with_name(filepath.name + '.gz')
        if gzip_path.is_file():
            filepath = gzip_path
            gzip = True

    st = filepath.stat()

    modsince = request.if_modified_since
    if modsince is not None and st.st_mtime <= modsince.timestamp():
        raise HTTPNotModified()

    ct, encoding = mimetypes.guess_type(str(filepath))
    if not ct:
        ct = 'application/octet-stream'

    resp = self._response_factory()
    resp.content_type = ct
    if encoding:
        resp.headers[hdrs.CONTENT_ENCODING] = encoding
    if gzip:
        resp.headers[hdrs.VARY] = hdrs.ACCEPT_ENCODING
    resp.last_modified = st.st_mtime

    # CACHE HACK
    if not self.development:
        cache_time = 31 * 86400  # = 1 month
        resp.headers[hdrs.CACHE_CONTROL] = "public, max-age={}".format(cache_time)

    file_size = st.st_size

    resp.content_length = file_size
    resp.set_tcp_cork(True)
    try:
        with filepath.open('rb') as f:
            yield from self._sendfile(request, resp, f, file_size)
    finally:
        resp.set_tcp_nodelay(True)

    return resp

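
# Minimal sketch of pre-generating the ".gz" sidecar files that send() looks
# for next to each static file.  The "static" directory path is an assumption;
# point it at wherever the served files actually live.
import gzip
from pathlib import Path


def precompress(static_dir='static'):
    for path in Path(static_dir).rglob('*'):
        if path.is_file() and path.suffix != '.gz':
            gz_path = path.with_name(path.name + '.gz')
            gz_path.write_bytes(gzip.compress(path.read_bytes(), compresslevel=9))
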
def background_check_feed(conn, feed, asyncioloop):
    logger.info(feed + ': Starting up background_check_feed')

    # Try to wait until Discord client has connected, etc:
    yield from client.wait_until_ready()
    # make sure debug output has this check run in the right order...
    yield from asyncio.sleep(1)

    user_agent = config["MAIN"].get("user_agent", USER_AGENT)

    # just a bit easier to use...
    FEED = config[feed]

    # pull config for this feed out:
    feed_url = FEED.get('feed_url')
    rss_refresh_time = FEED.getint('rss_refresh_time', 3600)
    start_skew = FEED.getint('start_skew', rss_refresh_time)
    start_skew_min = FEED.getint('start_skew_min', 1)
    max_age = FEED.getint('max_age', 86400)

    # loop through all the channels this feed is configured to send to
    channels = []
    for key in FEED.get('channels').split(','):
        logger.debug(feed + ': adding channel ' + key)
        # stick a dict in the channels array so we have more to work with
        channels.append(
            {
                'object': discord.Object(id=config['CHANNELS'][key]),
                'name': key,
                'id': config['CHANNELS'][key],
            }
        )

    if start_skew > 0:
        sleep_time = random.uniform(start_skew_min, start_skew)
        logger.info(feed + ':start_skew:sleeping for ' + str(sleep_time))
        yield from asyncio.sleep(sleep_time)

    # Basically run forever
    while not client.is_closed:
        # And tries to catch all the exceptions and just keep going
        # (but see list of except/finally stuff below)
        try:
            logger.info(feed + ': processing feed')

            # If send_typing is on for the feed, send a little "typing ..."
            # whenever a feed is being worked on.  configurable per-room
            if should_send_typing(FEED, feed):
                for channel in channels:
                    # Since this is first attempt to talk to this channel,
                    # be very verbose about failures to talk to channel
                    try:
                        yield from client.send_typing(channel['object'])
                    except discord.errors.Forbidden:
                        logger.exception(
                            "%s:%s:forbidden - is bot allowed in channel?",
                            feed, channel
                        )

            http_headers = {"User-Agent": user_agent}

            # Download the actual feed, if changed since last fetch

            # pull data about history of this *feed* from DB:
            cursor = conn.cursor()
            cursor.execute(
                "select lastmodified,etag from feed_info where feed=? OR url=?",
                [feed, feed_url])
            data = cursor.fetchone()

            # If we've handled this feed before,
            # and we have etag from last run, add etag to headers.
            # and if we have a last modified time from last run,
            # add "If-Modified-Since" to headers.
            if data is None:  # never handled this feed before...
                logger.info(feed + ':looks like updated version. saving info')
                cursor.execute(
                    "REPLACE INTO feed_info (feed,url) VALUES (?,?)",
                    [feed, feed_url])
                conn.commit()
                logger.debug(feed + ':feed info saved')
            else:
                logger.debug(feed + ':setting up extra headers for HTTP request.')
                logger.debug(data)
                lastmodified = data[0]
                etag = data[1]
                if lastmodified is not None and len(lastmodified):
                    logger.debug(feed + ':adding header If-Modified-Since: '
                                 + lastmodified)
                    http_headers['If-Modified-Since'] = lastmodified
                else:
                    logger.debug(feed + ':no stored lastmodified')
                if etag is not None and len(etag):
                    logger.debug(feed + ':adding header ETag: ' + etag)
                    http_headers['ETag'] = etag
                else:
                    logger.debug(feed + ':no stored ETag')

            logger.debug(feed + ':sending http request for ' + feed_url)
            # Send actual request.  yield from can yield control to another
            # instance.
            http_response = yield from httpclient.request(
                'GET', feed_url, headers=http_headers)
            logger.debug(http_response)

            # First check that we didn't get a "None" response, since that's
            # some sort of internal error thing:
            if http_response.status is None:
                logger.error(feed + ':HTTP response code is NONE')
                raise HTTPError()
            # Some feeds are smart enough to use that if-modified-since or
            # etag info, which gives us a 304 status.  If that happens,
            # assume no new items, fall through rest of this and try again
            # later.
            elif http_response.status == 304:
                logger.debug(feed + ':data is old; moving on')
                http_response.close()
                raise HTTPNotModified()
            # If we get anything but a 200, that's a problem and we don't
            # have good data, so give up and try later.
            # Mostly handled different than 304/not-modified to make logging
            # clearer.
            elif http_response.status != 200:
                logger.debug(feed + ':HTTP error not 200')
                # + str(http_response.status))
                # raise HTTPError()
            else:
                logger.debug(feed + ':HTTP success')

            # pull data out of the http response
            logger.debug(feed + ':reading http response')
            http_data = yield from http_response.read()

            # parse the data from the http response with feedparser
            logger.debug(feed + ':parsing http data')
            feed_data = feedparser.parse(http_data)
            logger.debug(feed + ':done fetching')

            # If we got an ETAG back in headers, store that, so we can
            # include on next fetch
            if 'ETAG' in http_response.headers:
                etag = http_response.headers['ETAG']
                logger.debug(feed + ':saving etag: ' + etag)
                cursor.execute(
                    "UPDATE feed_info SET etag=? where feed=? or url=?",
                    [etag, feed, feed_url])
                conn.commit()
                logger.debug(feed + ':etag saved')
            else:
                logger.debug(feed + ':no etag')

            # If we got a Last-Modified header back, store that, so we can
            # include on next fetch
            if 'LAST-MODIFIED' in http_response.headers:
                modified = http_response.headers['LAST-MODIFIED']
                logger.debug(feed + ':saving lastmodified: ' + modified)
                cursor.execute(
                    "UPDATE feed_info SET lastmodified=? where feed=? or url=?",
                    [modified, feed, feed_url])
                conn.commit()
                logger.debug(feed + ':saved lastmodified')
            else:
                logger.debug(feed + ':no last modified date')

            http_response.close()

            # Process all of the entries in the feed
            # Use reversed to start with end, which is usually oldest
            logger.debug(feed + ':processing entries')
            for item in reversed(feed_data.entries):
                logger.debug("%s:item:processing this entry:%r", feed, item)

                # Pull out the unique id, or just give up on this item.
                id = ''
                if 'id' in item:
                    id = item.id
                elif 'guid' in item:
                    id = item.guid
                elif 'link' in item:
                    id = item.link
                else:
                    logger.error(feed + ':item:no id, skipping')
                    continue

                # Get our best date out, in both raw and parsed form
                pubdate = extract_best_item_date(item, TIMEZONE)
                pubdate_fmt = pubdate.strftime("%a %b %d %H:%M:%S %Z %Y")

                logger.debug(feed + ':item:id:' + id)
                logger.debug(feed + ':item:checking database history for this item')
                # Check DB for this item
                cursor.execute(
                    "SELECT published,title,url,reposted FROM feed_items WHERE id=?",
                    [id])
                data = cursor.fetchone()

                # If we've never seen it before, then actually processing
                # this:
                if data is None:
                    logger.info(feed + ':item ' + id + ' unseen, processing:')

                    # Store info about this item, so next time we skip it:
                    cursor.execute(
                        "INSERT INTO feed_items (id,published) VALUES (?,?)",
                        [id, pubdate_fmt])
                    conn.commit()

                    # Doing some crazy date math stuff...
                    # max_age is mostly so that first run doesn't spew too
                    # much stuff into a room, but is also a useful safety
                    # measure in case a feed suddenly reverts to something
                    # ancient or other weird problems...
                    time_since_published = TIMEZONE.localize(
                        datetime.now()) - pubdate.astimezone(TIMEZONE)

                    if time_since_published.total_seconds() < max_age:
                        logger.info(feed + ':item:fresh and ready for parsing')

                        # Loop over all channels for this particular feed
                        # and process appropriately:
                        for channel in channels:
                            include = True
                            filter_field = FEED.get(
                                channel['name'] + '.filter_field',
                                FEED.get('filter_field', 'title'))
                            # Regex if channel exists
                            if (channel['name'] + '.filter') in FEED or 'filter' in FEED:
                                logger.debug(
                                    feed + ':item:running filter for' + channel['name'])
                                regexpat = FEED.get(
                                    channel['name'] + '.filter',
                                    FEED.get('filter', '^.*$'))
                                logger.debug(
                                    feed + ':item:using filter:' + regexpat +
                                    ' on ' + item['title'] +
                                    ' field ' + filter_field)
                                regexmatch = re.search(regexpat, item[filter_field])
                                if regexmatch is None:
                                    include = False
                                    logger.info(
                                        feed + ':item:failed filter for ' +
                                        channel['name'])
                            elif (channel['name'] + '.filter_exclude') in FEED or 'filter_exclude' in FEED:
                                logger.debug(
                                    feed + ':item:running exclude filter for' +
                                    channel['name'])
                                regexpat = FEED.get(
                                    channel['name'] + '.filter_exclude',
                                    FEED.get('filter_exclude', '^.*$'))
                                logger.debug(
                                    feed + ':item:using filter_exclude:' + regexpat +
                                    ' on ' + item['title'] +
                                    ' field ' + filter_field)
                                regexmatch = re.search(regexpat, item[filter_field])
                                if regexmatch is None:
                                    include = True
                                    logger.info(
                                        feed + ':item:passed exclude filter for ' +
                                        channel['name'])
                                else:
                                    include = False
                                    logger.info(
                                        feed + ':item:failed exclude filter for ' +
                                        channel['name'])
                            else:
                                include = True  # redundant safety net
                                logger.debug(
                                    feed + ':item:no filter configured for' +
                                    channel['name'])

                            if include is True:
                                logger.debug(
                                    feed + ':item:building message for ' +
                                    channel['name'])
                                message = build_message(FEED, item, channel)
                                logger.debug(
                                    feed + ':item:sending message (eventually) to ' +
                                    channel['name'])
                                yield from send_message_wrapper(
                                    asyncioloop, FEED, feed, channel, client, message)
                            else:
                                logger.info(
                                    feed +
                                    ':item:skipping item due to not passing filter for ' +
                                    channel['name'])
                    else:
                        # Logs of debugging info for date handling stuff...
                        logger.info("%s:too old, skipping", feed)
                        logger.debug("%s:now:now:%s", feed, time.time())
                        logger.debug("%s:now:gmtime:%s", feed, time.gmtime())
                        logger.debug("%s:now:localtime:%s", feed, time.localtime())
                        logger.debug("%s:pubDate:%r", feed, pubdate)
                        logger.debug(item)
                # seen before, move on:
                else:
                    logger.debug(feed + ':item:' + id + ' seen before, skipping')

        # This is completely expected behavior for a well-behaved feed:
        except HTTPNotModified:
            logger.debug(
                feed + ':Headers indicate feed unchanged since last time fetched:')
            logger.debug(sys.exc_info())
        # Many feeds have random periodic problems that shouldn't cause
        # permanent death:
        except HTTPError:
            logger.warn(feed + ':Unexpected HTTP error:')
            logger.warn(sys.exc_info())
            logger.warn(
                feed + ':Assuming error is transient and trying again later')
        # sqlite3 errors are probably really bad and we should just totally
        # give up on life
        except sqlite3.Error as sqlerr:
            logger.error(feed + ':sqlite3 error: ')
            logger.error(sys.exc_info())
            logger.error(sqlerr)
            raise
        # Ideally we'd remove the specific channel or something...
        # But I guess just throw an error into the log and try again later...
        except discord.errors.Forbidden:
            logger.error(feed + ':discord.errors.Forbidden')
            logger.error(sys.exc_info())
            logger.error(
                feed +
                ":Perhaps bot isn't allowed in one of the channels for this feed?")
            # raise  # or not? hmm...
        # unknown error: definitely give up and die and move on
        except Exception:
            logger.exception("Unexpected error - giving up")
            raise
        # No matter what goes wrong, wait same time and try again
        finally:
            logger.debug(feed + ':sleeping for ' + str(rss_refresh_time) + ' seconds')
            yield from asyncio.sleep(rss_refresh_time)

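
# A sketch of the SQLite schema that the queries above appear to assume
# ("feed_info" keyed by feed/url with lastmodified/etag columns, "feed_items"
# keyed by item id).  Column types and primary keys are guesses for
# illustration, not the project's actual schema definition.
import sqlite3


def init_db(db_path='feed2discord.db'):
    conn = sqlite3.connect(db_path)
    conn.executescript("""
        CREATE TABLE IF NOT EXISTS feed_info (
            feed TEXT PRIMARY KEY,
            url TEXT,
            lastmodified TEXT,
            etag TEXT
        );
        CREATE TABLE IF NOT EXISTS feed_items (
            id TEXT PRIMARY KEY,
            published TEXT,
            title TEXT,
            url TEXT,
            reposted TEXT
        );
    """)
    conn.commit()
    return conn
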
def background_check_feed(feed):
    logger.info(feed + ': Starting up background_check_feed')

    yield from client.wait_until_ready()
    # make sure debug output has this check run in the right order...
    yield from asyncio.sleep(1)

    FEED = config[feed]

    feed_url = FEED.get('feed_url')
    rss_refresh_time = FEED.getint('rss_refresh_time', 3600)
    max_age = FEED.getint('max_age', 86400)

    channels = []
    for key in FEED.get('channels').split(','):
        logger.debug(feed + ': adding channel ' + key)
        channels.append(discord.Object(id=config['CHANNELS'][key]))

    while not client.is_closed:
        try:
            logger.info(feed + ': processing feed')

            if FEED.getint('send_typing', 0) >= 1:
                for channel in channels:
                    try:
                        yield from client.send_typing(channel)
                    except discord.errors.Forbidden:
                        logger.error(feed + ':discord.errors.Forbidden')
                        logger.error(sys.exc_info())
                        logger.error(
                            feed + ":Perhaps bot isn't allowed in this channel?")
                        logger.error(channel)

            http_headers = {}
            http_headers['User-Agent'] = MAIN.get('UserAgent', 'feed2discord/1.0')

            ### Download the actual feed, if changed since last fetch
            cursor = conn.cursor()
            cursor.execute(
                "select lastmodified,etag from feed_info where feed=? OR url=?",
                [feed, feed_url])
            data = cursor.fetchone()

            if data is None:
                logger.info(feed + ':looks like updated version. saving info')
                cursor.execute(
                    "REPLACE INTO feed_info (feed,url) VALUES (?,?)",
                    [feed, feed_url])
                conn.commit()
                logger.debug(feed + ':feed info saved')
            else:
                logger.debug(feed + ':setting up extra headers for HTTP request.')
                logger.debug(data)
                lastmodified = data[0]
                etag = data[1]
                if lastmodified is not None and len(lastmodified):
                    logger.debug(feed + ':adding header If-Modified-Since: '
                                 + lastmodified)
                    http_headers['If-Modified-Since'] = lastmodified
                else:
                    logger.debug(feed + ':no stored lastmodified')
                if etag is not None and len(etag):
                    logger.debug(feed + ':adding header ETag: ' + etag)
                    http_headers['ETag'] = etag
                else:
                    logger.debug(feed + ':no stored ETag')

            logger.debug(feed + ':sending http request for ' + feed_url)
            http_response = yield from httpclient.request(
                'GET', feed_url, headers=http_headers)
            logger.debug(http_response)

            if http_response.status == 304:
                logger.debug(feed + ':data is old; moving on')
                http_response.close()
                raise HTTPNotModified()
            elif http_response.status != 200:
                logger.debug(feed + ':HTTP error: ' + str(http_response.status))
                http_response.close()
                raise HTTPError()
            else:
                logger.debug(feed + ':HTTP success')

            logger.debug(feed + ':reading http response')
            http_data = yield from http_response.read()

            logger.debug(feed + ':parsing http data')
            feed_data = feedparser.parse(http_data)
            logger.debug(feed + ':done fetching')

            if 'ETAG' in http_response.headers:
                etag = http_response.headers['ETAG']
                logger.debug(feed + ':saving etag: ' + etag)
                cursor.execute(
                    "UPDATE feed_info SET etag=? where feed=? or url=?",
                    [etag, feed, feed_url])
                conn.commit()
                logger.debug(feed + ':etag saved')
            else:
                logger.debug(feed + ':no etag')

            if 'LAST-MODIFIED' in http_response.headers:
                modified = http_response.headers['LAST-MODIFIED']
                logger.debug(feed + ':saving lastmodified: ' + modified)
                cursor.execute(
                    "UPDATE feed_info SET lastmodified=? where feed=? or url=?",
                    [modified, feed, feed_url])
                conn.commit()
                logger.debug(feed + ':saved lastmodified')
            else:
                logger.debug(feed + ':no last modified date')

            http_response.close()

            logger.debug(feed + ':processing entries')
            for item in feed_data.entries:
                logger.debug(feed + ':item:processing this entry')
                # logger.debug(item)  # can be very noisy

                id = ''
                if 'id' in item:
                    id = item.id
                elif 'guid' in item:
                    id = item.guid
                else:
                    logger.error(feed + ':item:no id, skipping')
                    continue

                pubDateDict = extract_best_item_date(item)
                pubDate = pubDateDict['date']
                pubDate_parsed = pubDateDict['date_parsed']

                logger.debug(feed + ':item:id:' + id)
                logger.debug(feed + ':item:checking database history for this item')
                cursor.execute(
                    "SELECT published,title,url,reposted FROM feed_items WHERE id=?",
                    [id])
                data = cursor.fetchone()

                if data is None:
                    logger.info(feed + ':item ' + id + ' unseen, processing:')
                    cursor.execute(
                        "INSERT INTO feed_items (id,published) VALUES (?,?)",
                        [id, pubDate])
                    conn.commit()

                    if time.mktime(pubDate_parsed) > (time.time() - max_age):
                        logger.info(feed + ':item:fresh and ready for parsing')
                        logger.debug(feed + ':item:building message')
                        message = build_message(FEED, item)
                        for channel in channels:
                            logger.debug(feed + ':item:sending message')
                            yield from client.send_message(channel, message)
                    else:
                        logger.info(feed + ':too old; skipping')
                        logger.debug(feed + ':now:' + str(time.time()))
                        logger.debug(feed + ':now:gmtime:' + str(time.gmtime()))
                        logger.debug(feed + ':now:localtime:' + str(time.localtime()))
                        logger.debug(feed + ':pubDate:' + str(pubDate))
                        logger.debug(feed + ':pubDate_parsed:' + str(pubDate_parsed))
                        if debug >= 4:
                            logger.debug(item)
                else:
                    logger.debug(feed + ':item:' + id + ' seen before, skipping')

        except HTTPNotModified:
            logger.debug(
                feed + ':Headers indicate feed unchanged since last time fetched:')
            logger.debug(sys.exc_info())
        except HTTPError:
            logger.warn(feed + ':Unexpected HTTP error:')
            logger.warn(sys.exc_info())
            logger.warn(feed + ':Assuming error is transient and trying again later')
        except sqlite3.Error as sqlerr:
            logger.error(feed + ':sqlite3 error: ')
            logger.error(sys.exc_info())
            logger.error(sqlerr)
            raise
        except discord.errors.Forbidden:
            logger.error(feed + ':discord.errors.Forbidden')
            logger.error(sys.exc_info())
            logger.error(
                feed +
                ":Perhaps bot isn't allowed in one of the channels for this feed?"
            )
            # raise  # or not? hmm...
        except:
            logger.error(feed + ':Unexpected error:')
            logger.error(sys.exc_info())
            logger.error(feed + ':giving up')
            raise
        finally:
            # No matter what goes wrong, wait same time and try again
            logger.debug(feed + ':sleeping for ' + str(rss_refresh_time) + ' seconds')
            yield from asyncio.sleep(rss_refresh_time)

async def background_check_feed(feed, asyncioloop):
    logger.info(feed + ": Starting up background_check_feed")

    # Try to wait until Discord client has connected, etc:
    await client.wait_until_ready()
    # make sure debug output has this check run in the right order...
    await asyncio.sleep(1)

    user_agent = config["MAIN"].get("user_agent", USER_AGENT)

    # just a bit easier to use...
    FEED = config[feed]

    # pull config for this feed out:
    feed_url = FEED.get("feed_url")
    rss_refresh_time = FEED.getint("rss_refresh_time", 3600)
    start_skew = FEED.getint("start_skew", rss_refresh_time)
    start_skew_min = FEED.getint("start_skew_min", 1)
    max_age = FEED.getint("max_age", 86400)

    # loop through all the channels this feed is configured to send to
    channels = []
    for key in FEED.get("channels").split(","):
        # stick a dict in the channels array so we have more to work with
        channel_id = config["CHANNELS"].getint(key)
        logger.info(feed + ": adding channel " + key + ":" + str(channel_id))
        channel_obj = client.get_channel(channel_id)
        logger.info(pformat(channel_obj))
        if channel_obj is not None:
            channels.append({
                "object": channel_obj,
                "name": key,
                "id": channel_id,
            })
            logger.info(feed + ": added channel " + key)
        else:
            logger.warning(feed + ": did not add channel " + key + "/" + str(channel_id))
            logger.warning(pformat(channel_obj))

    if start_skew > 0:
        sleep_time = random.uniform(start_skew_min, start_skew)
        logger.info(feed + ":start_skew:sleeping for " + str(sleep_time))
        await asyncio.sleep(sleep_time)

    # Basically run forever
    while True:
        # And try to catch all the exceptions and just keep going
        # (but see list of except/finally stuff below)
        try:
            # set current "game played" constantly so that it sticks around
            gameplayed = MAIN.get("gameplayed", "gitlab.com/ffreiheit/discord_feedbot")
            await client.change_presence(activity=discord.Game(name=gameplayed))

            logger.info(feed + ": processing feed")

            # If send_typing is on for the feed, send a little "typing ..."
            # whenever a feed is being worked on.  configurable per-room
            if should_send_typing(FEED, feed):
                for channel in channels:
                    # Since this is first attempt to talk to this channel,
                    # be very verbose about failures to talk to channel
                    try:
                        await client.send_typing(channel["object"])
                    except discord.errors.Forbidden:
                        logger.exception(
                            "%s:%s:forbidden - is bot allowed in channel?",
                            feed,
                            channel,
                        )

            http_headers = {"User-Agent": user_agent}

            db_path = config["MAIN"].get("db_path", "feed2discord.db")
            # Debugging crazy issues
            logger.info(feed + ":db_debug:db_path=" + db_path)
            conn = sqlite3.connect(db_path)

            # Download the actual feed, if changed since last fetch

            # Debugging crazy issues
            logger.info(feed + ":db_debug:conn=" + type(conn).__name__)

            # pull data about history of this *feed* from DB:
            cursor = conn.execute(
                "select lastmodified,etag from feed_info where feed=? OR url=?",
                [feed, feed_url],
            )
            data = cursor.fetchone()

            # If we've handled this feed before,
            # and we have etag from last run, add etag to headers.
            # and if we have a last modified time from last run,
            # add "If-Modified-Since" to headers.
            if data is None:  # never handled this feed before...
                logger.info(feed + ":looks like updated version. saving info")
                conn.execute("REPLACE INTO feed_info (feed,url) VALUES (?,?)",
                             [feed, feed_url])
                conn.commit()
                logger.info(feed + ":feed info saved")
            else:
                logger.info(feed + ":setting up extra headers for HTTP request.")
                logger.info(data)
                lastmodified = data[0]
                etag = data[1]
                if lastmodified is not None and len(lastmodified):
                    logger.info(feed + ":adding header If-Modified-Since: "
                                + lastmodified)
                    http_headers["If-Modified-Since"] = lastmodified
                else:
                    logger.info(feed + ":no stored lastmodified")
                if etag is not None and len(etag):
                    logger.info(feed + ":adding header ETag: " + etag)
                    http_headers["ETag"] = etag
                else:
                    logger.info(feed + ":no stored ETag")

            # Set up httpclient
            httpclient = aiohttp.ClientSession()

            logger.info(feed + ":sending http request for " + feed_url)
            # Send actual request.  await can yield control to another
            # instance.
            http_response = await httpclient.get(feed_url, headers=http_headers)
            logger.info(http_response)

            # First check that we didn't get a "None" response, since that's
            # some sort of internal error thing:
            if http_response.status is None:
                logger.error(feed + ":HTTP response code is NONE")
                raise HTTPError()
            # Some feeds are smart enough to use that if-modified-since or
            # etag info, which gives us a 304 status.  If that happens,
            # assume no new items, fall through rest of this and try again
            # later.
            elif http_response.status == 304:
                logger.info(feed + ":data is old; moving on")
                http_response.close()
                raise HTTPNotModified()
            # If we get anything but a 200, that's a problem and we don't
            # have good data, so give up and try later.
            # Mostly handled different than 304/not-modified to make logging
            # clearer.
            elif http_response.status != 200:
                logger.info(feed + ":HTTP error not 200")
                http_response.close()
                raise HTTPNotModified()
            else:
                logger.info(feed + ":HTTP success")

            # pull data out of the http response
            logger.info(feed + ":reading http response")
            http_data = await http_response.read()
            await httpclient.close()

            # parse the data from the http response with feedparser
            logger.info(feed + ":parsing http data")
            feed_data = feedparser.parse(http_data)
            logger.info(feed + ":done fetching")

            # If we got an ETAG back in headers, store that, so we can
            # include on next fetch
            if "ETAG" in http_response.headers:
                etag = http_response.headers["ETAG"]
                logger.info(feed + ":saving etag: " + etag)
                conn.execute(
                    "UPDATE feed_info SET etag=? where feed=? or url=?",
                    [etag, feed, feed_url],
                )
                conn.commit()
                logger.info(feed + ":etag saved")
            else:
                logger.info(feed + ":no etag")

            # If we got a Last-Modified header back, store that, so we can
            # include on next fetch
            if "LAST-MODIFIED" in http_response.headers:
                modified = http_response.headers["LAST-MODIFIED"]
                logger.info(feed + ":saving lastmodified: " + modified)
                conn.execute(
                    "UPDATE feed_info SET lastmodified=? where feed=? or url=?",
                    [modified, feed, feed_url],
                )
                conn.commit()
                logger.info(feed + ":saved lastmodified")
            else:
                logger.info(feed + ":no last modified date")

            http_response.close()

            # Process all of the entries in the feed
            # Use reversed to start with end, which is usually oldest
            logger.info(feed + ":processing entries")
            for item in reversed(feed_data.entries):
                # Pull out the unique id, or just give up on this item.
                id = ""
                if "id" in item:
                    id = item.id
                elif "guid" in item:
                    id = item.guid
                elif "link" in item:
                    id = item.link
                else:
                    logger.error(feed + ":item:no id, skipping")
                    continue

                # Get our best date out, in both raw and parsed form
                pubdate = extract_best_item_date(item, TIMEZONE)
                pubdate_fmt = pubdate.strftime("%a %b %d %H:%M:%S %Z %Y")

                logger.info(
                    "%s:item:processing this entry:%s:%s:%s",
                    feed,
                    id,
                    pubdate_fmt,
                    item.title,
                )

                logger.info(feed + ":item:id:" + id)
                logger.info(feed + ":item:checking database history for this item")
                # Check DB for this item
                cursor = conn.execute(
                    "SELECT published,title,url,reposted FROM feed_items WHERE id=?",
                    [id],
                )
                data = cursor.fetchone()

                # If we've never seen it before, then actually processing
                # this:
                if data is None:
                    logger.info(feed + ":item " + id + " unseen, processing:")

                    # Store info about this item, so next time we skip it:
                    conn.execute(
                        "INSERT INTO feed_items (id,published) VALUES (?,?)",
                        [id, pubdate_fmt],
                    )
                    conn.commit()

                    # Doing some crazy date math stuff...
                    # max_age is mostly so that first run doesn't spew too
                    # much stuff into a room, but is also a useful safety
                    # measure in case a feed suddenly reverts to something
                    # ancient or other weird problems...
                    time_since_published = TIMEZONE.localize(
                        datetime.now()) - pubdate.astimezone(TIMEZONE)
                    logger.debug(
                        '%s:time_since_published.total_seconds:%s,max_age:%s',
                        feed, time_since_published.total_seconds(), max_age)

                    if time_since_published.total_seconds() < max_age:
                        logger.info(feed + ":item:fresh and ready for parsing")

                        # Loop over all channels for this particular feed
                        # and process appropriately:
                        for channel in channels:
                            include = True
                            filter_field = FEED.get(
                                channel["name"] + ".filter_field",
                                FEED.get("filter_field", "title"),
                            )
                            # Regex if channel exists
                            if (channel["name"] + ".filter") in FEED or "filter" in FEED:
                                logger.info(feed + ":item:running filter for"
                                            + channel["name"])
                                regexpat = FEED.get(
                                    channel["name"] + ".filter",
                                    FEED.get("filter", "^.*$"),
                                )
                                logger.info(feed + ":item:using filter:" + regexpat
                                            + " on " + item["title"]
                                            + " field " + filter_field)
                                regexmatch = re.search(
                                    regexpat,
                                    process_field(filter_field, item, FEED, channel),
                                )
                                if regexmatch is None:
                                    include = False
                                    logger.info(feed + ":item:failed filter for "
                                                + channel["name"])
                            elif (channel["name"] + ".filter_exclude") in FEED or "filter_exclude" in FEED:
                                logger.info(
                                    feed + ":item:running exclude filter for"
                                    + channel["name"])
                                regexpat = FEED.get(
                                    channel["name"] + ".filter_exclude",
                                    FEED.get("filter_exclude", "^.*$"),
                                )
                                logger.info(feed + ":item:using filter_exclude:"
                                            + regexpat + " on " + item["title"]
                                            + " field " + filter_field)
                                regexmatch = re.search(
                                    regexpat,
                                    process_field(filter_field, item, FEED, channel),
                                )
                                if regexmatch is None:
                                    include = True
                                    logger.info(
                                        feed + ":item:passed exclude filter for "
                                        + channel["name"])
                                else:
                                    include = False
                                    logger.info(
                                        feed + ":item:failed exclude filter for "
                                        + channel["name"])
                            else:
                                include = True  # redundant safety net
                                logger.info(feed + ":item:no filter configured for"
                                            + channel["name"])

                            if include is True:
                                logger.info(feed + ":item:building message for "
                                            + channel["name"])
                                message = build_message(FEED, item, channel)
                                logger.info(
                                    feed + ":item:sending message (eventually) to "
                                    + channel["name"])
                                await send_message_wrapper(
                                    asyncioloop, FEED, feed, channel, client, message)
                            else:
                                logger.info(
                                    feed +
                                    ":item:skipping item due to not passing filter for "
                                    + channel["name"])
                    else:
                        # Logs of debugging info for date handling stuff...
                        logger.info("%s:too old, skipping", feed)
                        logger.debug("%s:now:now:%s", feed, time.time())
                        logger.debug("%s:now:gmtime:%s", feed, time.gmtime())
                        logger.debug("%s:now:localtime:%s", feed, time.localtime())
                        logger.debug("%s:pubDate:%r", feed, pubdate)
                        logger.debug(item)
                # seen before, move on:
                else:
                    logger.info(feed + ":item:" + id + " seen before, skipping")

        # This is completely expected behavior for a well-behaved feed:
        except HTTPNotModified:
            logger.info(
                feed + ":Headers indicate feed unchanged since last time fetched:")
            logger.debug(sys.exc_info())
        # Many feeds have random periodic problems that shouldn't cause
        # permanent death:
        except HTTPError:
            logger.warn(feed + ":Unexpected HTTP error:")
            logger.warn(sys.exc_info())
            logger.warn(feed + ":Assuming error is transient and trying again later")
        # sqlite3 errors are probably really bad and we should just totally
        # give up on life
        except sqlite3.Error as sqlerr:
            logger.error(feed + ":sqlite error: ")
            logger.error(sys.exc_info())
            logger.error(sqlerr)
            raise
        # Ideally we'd remove the specific channel or something...
        # But I guess just throw an error into the log and try again later...
        except discord.errors.Forbidden:
            logger.error(feed + ":discord.errors.Forbidden")
            logger.error(sys.exc_info())
            logger.error(
                feed +
                ":Perhaps bot isn't allowed in one of the channels for this feed?"
            )
            # raise  # or not? hmm...
        # unknown error: definitely give up and die and move on
        except Exception:
            logger.exception("Unexpected error - giving up")
            raise
        # No matter what goes wrong, wait same time and try again
        finally:
            logger.info(feed + ":sleeping for " + str(rss_refresh_time) + " seconds")
            await asyncio.sleep(rss_refresh_time)

async def background_check_feed(conn, feed, async_loop):
    """
    The main work loop

    One of these is run for each feed.  It's an asyncio thing: "await"
    (sleep or I/O) returns to the main loop and gives other feeds a
    chance to run.
    """
    logger.info(f'{feed}: Starting up background_check_feed')

    # Try to wait until Discord client has connected, etc:
    await client.wait_until_ready()
    # make sure debug output has this check run in the right order...
    await asyncio.sleep(1)

    user_agent = config["MAIN"].get("user_agent", USER_AGENT)

    # just a bit easier to use...
    _feed = config[feed]

    # pull config for this feed out:
    feed_url = _feed.get('feed_url')
    rss_refresh_time = _feed.getint('rss_refresh_time', 3600)
    start_skew = _feed.getint('start_skew', rss_refresh_time)
    start_skew_min = _feed.getint('start_skew_min', 1)
    max_age = _feed.getint('max_age', 86400)

    # loop through all the channels this feed is configured to send to
    channels = []
    for key in _feed.get('channels').split(','):
        logger.debug(feed + ': adding channel ' + key)
        # stick a dict in the channels array so we have more to work with
        channels.append({
            'channel_obj': client.get_channel(int(config['CHANNELS'][key])),
            'name': key,
            'id': int(config['CHANNELS'][key]),
        })

    if start_skew > 0:
        sleep_time = random.uniform(start_skew_min, start_skew)
        logger.info(f'{feed}:start_skew:sleeping for {str(sleep_time)}')
        await asyncio.sleep(sleep_time)

    # Basically run forever
    while not client.is_closed():
        # And tries to catch all the exceptions and just keep going
        # (but see list of except/finally stuff below)
        try:
            logger.info(f'{feed}: processing feed')

            http_headers = {"User-Agent": user_agent}

            # Download the actual feed, if changed since last fetch

            # pull data about history of this *feed* from DB:
            cursor = conn.cursor()
            cursor.execute(
                "select lastmodified,etag from feed_info where feed=? OR url=?",
                [feed, feed_url])
            data = cursor.fetchone()

            # If we've handled this feed before,
            # and we have etag from last run, add etag to headers.
            # and if we have a last modified time from last run,
            # add "If-Modified-Since" to headers.
            if data is None:  # never handled this feed before...
                logger.info(f"{feed}:looks like updated version. saving info")
                cursor.execute(
                    "REPLACE INTO feed_info (feed,url) VALUES (?,?)",
                    [feed, feed_url])
                conn.commit()
                logger.debug(f"{feed}:feed info saved")
            else:
                logger.debug(
                    f"{feed}:setting up extra headers for HTTP request.")
                logger.debug(data)
                lastmodified = data[0]
                etag = data[1]
                if lastmodified is not None and len(lastmodified):
                    logger.debug(
                        f"{feed}:adding header If-Modified-Since: {lastmodified}"
                    )
                    http_headers['If-Modified-Since'] = lastmodified
                else:
                    logger.debug(f"{feed}:no stored lastmodified")
                if etag is not None and len(etag):
                    logger.debug(f"{feed}:adding header ETag: {etag}")
                    http_headers['ETag'] = etag
                else:
                    logger.debug(f"{feed}:no stored ETag")

            logger.debug(f"{feed}:sending http request for {feed_url}")
            feed_data = None
            # Send actual request.
            async with aiohttp.ClientSession() as sess:
                async with sess.get(feed_url, headers=http_headers) as http_response:
                    logger.debug(http_response)

                    # First check that we didn't get a "None" response, since
                    # that's some sort of internal error thing:
                    if http_response.status is None:
                        logger.error(f"{feed}:HTTP response code is NONE")
                        http_response.close()
                        # raise not HTTPError because this is giving me NoneType errors
                        raise HTTPForbidden()
                    # Some feeds are smart enough to use that if-modified-since
                    # or etag info, which gives us a 304 status.  If that
                    # happens, assume no new items, fall through rest of this
                    # and try again later.
                    elif http_response.status == 304:
                        logger.debug(f"{feed}:data is old; moving on")
                        http_response.close()
                        raise HTTPNotModified()
                    # If we get anything but a 200, that's a problem and we
                    # don't have good data, so give up and try later.
                    # Mostly handled different than 304/not-modified to make
                    # logging clearer.
                    elif http_response.status != 200:
                        logger.debug(f"{feed}:HTTP error not 200")
                        http_response.close()
                        # raise not HTTPError because this is giving me NoneType errors
                        raise HTTPForbidden()
                    else:
                        logger.debug(f"{feed}:HTTP success")

                        # pull data out of the http response
                        logger.debug(f"{feed}:reading http response")
                        http_data = await http_response.read()

                        # parse the data from the http response with feedparser
                        logger.debug(f"{feed}:parsing http data")
                        feed_data = feedparser.parse(http_data)
                        logger.debug(f"{feed}:done fetching")

                        # If we got an ETAG back in headers, store that, so we
                        # can include on next fetch
                        if 'ETAG' in http_response.headers:
                            etag = http_response.headers['ETAG']
                            logger.debug(f"{feed}:saving etag: {etag}")
                            cursor.execute(
                                "UPDATE feed_info SET etag=? where feed=? or url=?",
                                [etag, feed, feed_url])
                            conn.commit()
                            logger.debug(f"{feed}:etag saved")
                        else:
                            logger.debug(f"{feed}:no etag")

                        # If we got a Last-Modified header back, store that, so
                        # we can include on next fetch
                        if 'LAST-MODIFIED' in http_response.headers:
                            modified = http_response.headers['LAST-MODIFIED']
                            logger.debug(f"{feed}:saving lastmodified: {modified}")
                            cursor.execute(
                                "UPDATE feed_info SET lastmodified=? where feed=? or url=?",
                                [modified, feed, feed_url])
                            conn.commit()
                            logger.debug(f"{feed}:saved lastmodified")
                        else:
                            logger.debug(f"{feed}:no last modified date")

            # Process all of the entries in the feed
            # Use reversed to start with end, which is usually oldest
            logger.debug(f"{feed}:processing entries")
            if feed_data is None:
                logger.error(f"{feed}:no data in feed_data")
                # raise not HTTPError because this is giving me NoneType errors
                raise HTTPForbidden()
            for item in reversed(feed_data.entries):
                logger.debug(f"{feed}:item:processing this entry:{item}")

                # Pull out the unique id, or just give up on this item.
                if 'id' in item:
                    uid = item.id
                elif 'guid' in item:
                    uid = item.guid
                elif 'link' in item:
                    uid = item.link
                else:
                    logger.error(f"{feed}:item:no id, skipping")
                    continue

                # Get our best date out, in both raw and parsed form
                pubdate = extract_best_item_date(item, TIMEZONE)
                pubdate_fmt = pubdate.strftime("%a %b %d %H:%M:%S %Z %Y")

                logger.debug(f"{feed}:item:id:{uid}")
                logger.debug(
                    f"{feed}:item:checking database history for this item")
                # Check DB for this item
                cursor.execute(
                    "SELECT published,title,url,reposted FROM feed_items WHERE id=?",
                    [uid])
                data = cursor.fetchone()

                # If we've never seen it before, then actually processing
                # this:
                if data is None:
                    logger.info(f"{feed}:item {uid} unseen, processing:")

                    # Store info about this item, so next time we skip it:
                    cursor.execute(
                        "INSERT INTO feed_items (id,published) VALUES (?,?)",
                        [uid, pubdate_fmt])
                    conn.commit()

                    # Doing some crazy date math stuff...
                    # max_age is mostly so that first run doesn't spew too
                    # much stuff into a room, but is also a useful safety
                    # measure in case a feed suddenly reverts to something
                    # ancient or other weird problems...
                    time_since_published = TIMEZONE.localize(
                        datetime.now()) - pubdate.astimezone(TIMEZONE)

                    if time_since_published.total_seconds() < max_age:
                        logger.info(f"{feed}:item:fresh and ready for parsing")

                        # Loop over all channels for this particular feed
                        # and process appropriately:
                        for channel in channels:
                            # just a bit easier to use...
                            _name = channel['name']
                            include = True
                            filter_field = _feed.get(
                                f"{_name}.filter_field",
                                _feed.get('filter_field', 'title'))
                            # Regex if channel exists
                            if f"{_name}.filter" in _feed or 'filter' in _feed:
                                logger.debug(
                                    f"{feed}:item:running filter for {_name}")
                                re_pat = _feed.get(f"{_name}.filter",
                                                   _feed.get('filter', '^.*$'))
                                logger.debug(
                                    f"{feed}:item:using filter: {re_pat} on "
                                    f"{item['title']} field {filter_field}")
                                re_match = re.search(re_pat, item[filter_field])
                                if re_match is None:
                                    include = False
                                    logger.info(
                                        f"{feed}:item:failed filter for {_name}"
                                    )
                            elif f"{_name}.filter_exclude" in _feed or 'filter_exclude' in _feed:
                                logger.debug(
                                    f"{feed}:item:running exclude filter for {_name}"
                                )
                                re_pat = _feed.get(
                                    f"{_name}.filter_exclude",
                                    _feed.get('filter_exclude', '^.*$'))
                                logger.debug(
                                    f"{feed}:item:using filter_exclude: {re_pat} on "
                                    f"{item['title']} field {filter_field}")
                                re_match = re.search(re_pat, item[filter_field])
                                if re_match is None:
                                    include = True
                                    logger.info(
                                        f"{feed}:item:passed exclude filter for {_name}"
                                    )
                                else:
                                    include = False
                                    logger.info(
                                        f"{feed}:item:failed exclude filter for {_name}"
                                    )
                            else:
                                include = True  # redundant safety net
                                logger.debug(
                                    f"{feed}:item:no filter configured for {_name}"
                                )

                            if include is True:
                                logger.debug(
                                    f"{feed}:item:building message for {_name}"
                                )
                                message = build_message(_feed, item, channel)
                                logger.debug(
                                    f"{feed}:item:sending message (eventually) to {_name}"
                                )
                                await send_message_wrapper(
                                    async_loop, feed, channel, message)
                            else:
                                logger.info(
                                    f"{feed}:item:skipping item due to not passing filter for {_name}"
                                )
                    else:
                        # Logs of debugging info for date handling stuff...
                        logger.info(f"{feed}:too old, skipping")
                        logger.debug(f"{feed}:now:now:{time.time()}")
                        logger.debug(f"{feed}:now:gmtime:{time.gmtime()}")
                        logger.debug(
                            f"{feed}:now:localtime:{time.localtime()}")
                        logger.debug(f"{feed}:pubDate:{pubdate}")
                        logger.debug(item)
                # seen before, move on:
                else:
                    logger.debug(f"{feed}:item: {uid} seen before, skipping")

        # This is completely expected behavior for a well-behaved feed:
        except HTTPNotModified:
            logger.debug(
                f"{datetime.today()}:{feed}: Headers indicate feed unchanged since last time fetched:"
            )
            logger.debug(sys.exc_info())
        # Many feeds have random periodic problems that shouldn't cause
        # permanent death:
        except HTTPForbidden:
            logger.warning(f"{datetime.today()}:{feed}: Unexpected HTTPError:")
            logger.warning(sys.exc_info())
            logger.warning(
                f"{datetime.today()}:{feed}: Assuming error is transient and trying again later"
            )
        # sqlite3 errors are probably really bad and we should just totally
        # give up on life
        except sqlite3.Error as sqlerr:
            logger.error(f"{datetime.today()}:{feed}: sqlite3 error: ")
            logger.error(sys.exc_info())
            logger.error(sqlerr)
            raise
        # Ideally we'd remove the specific channel or something...
        # But I guess just throw an error into the log and try again later...
        except discord.errors.Forbidden:
            logger.error(
                f"{datetime.today()}:{feed}: discord.errors.Forbidden")
            logger.error(sys.exc_info())
            logger.error(
                f"{datetime.today()}:{feed}: Perhaps bot isn't allowed in one of the channels for this feed?"
            )
            # raise  # or not? hmm...
        except asyncio.TimeoutError:
            logger.error(f"{datetime.today()}:{feed}: Timeout error")
        except aiohttp.ClientConnectorError:
            logger.error(f"{datetime.today()}:{feed}: Connection failed!")
        except aiohttp.ClientOSError:
            logger.error(
                f"{datetime.today()}:{feed}: Connection not responding!")
        except aiohttp.ServerDisconnectedError:
            logger.error(f"{datetime.today()}:{feed}: Socket closed by peer")
        # unknown error: definitely give up and die and move on
        except BaseException:
            logger.exception(
                f"{datetime.today()}:{feed}: Unexpected error - giving up")
            # raise  # or not? hmm...
        # No matter what goes wrong, wait same time and try again
        finally:
            logger.debug(
                f"{feed}:sleeping for {str(rss_refresh_time)} seconds")
            await asyncio.sleep(rss_refresh_time)

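
# Minimal sketch of how one of these per-feed workers might be scheduled,
# assuming `config` is the configparser object used above, `client` is the
# discord.py client, and every section other than MAIN/CHANNELS names a feed.
# The "db_path" and "login_token" config keys are assumptions for illustration.
import asyncio
import sqlite3


def main():
    loop = asyncio.get_event_loop()
    conn = sqlite3.connect(config['MAIN'].get('db_path', 'feed2discord.db'))
    for feed in config.sections():
        if feed in ('MAIN', 'CHANNELS'):
            continue
        loop.create_task(background_check_feed(conn, feed, loop))
    # client.start() runs until the bot shuts down; the feed tasks run
    # alongside it on the same loop.
    loop.run_until_complete(client.start(config['MAIN'].get('login_token')))


if __name__ == '__main__':
    main()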