def fetch_feed(self, url):
    """Fetch feed at `url` and dispatch an event for each not-yet-seen post.

    This is a generator: the HTTP request is yielded and its result is
    expected to be sent back in (presumably driven by defer.inlineCallbacks
    applied outside of this block, since it exits via defer.returnValue).

    Next fetch is always scheduled through schedule_fetch(), with fast=True
    if the request failed with HTTPClientError.

    Raises ValueError for unrecognized feed type, or if none of the feed
    templates can be formatted for some post.
    """
    feed_type = self.feeds[url].type

    fetch_failed, data = False, None
    try:
        data = yield self.client.request(url)
    except HTTPClientError as err:
        log.warn('Failed to fetch feed ({}): {}'.format(url, err))
        fetch_failed, data = True, None
    finally:
        # do faster re-fetch on errors
        self.schedule_fetch(url, fast=fetch_failed)

    if data is None:
        defer.returnValue(None)  # cache hit, not modified, error
    body, headers = data

    ## Parse response into feed metadata and a list of posts
    if feed_type == 'feed':
        import feedparser
        parsed = feedparser.parse(body, response_headers=headers)
        feed, posts = parsed.feed, parsed.entries
    elif feed_type == 'reddit-json':
        from lya import AttrDict  # mandatory dep anyway
        listing = json.loads(body)['data']
        posts = [AttrDict(child['data']) for child in listing.pop('children')]
        feed = AttrDict(listing)
    else:
        raise ValueError('Unrecognized feed type: {!r}'.format(self.feeds[url].type))

    ## Relay posts, going through them in reverse order
    processed = 0
    for post in reversed(posts):
        if feed_type == 'reddit-json':
            # Some reddit-api-specific encoding hacks
            try:
                title = unescape(post['title'])
            except KeyError:
                pass
            else:
                post.title = title

        post_obj = FeedEntryInfo(feed, post, self.conf)
        post_id = [
            force_bytes(post_obj.get_by_path(attr))
            for attr in self.feeds[url].deduplication]
        if not self.filter_db.add(url, post_id):
            continue  # filter_db refused the id, nothing to relay

        # First template that formats without lookup errors is used
        relayed, first_err = None, None
        for template in self.feeds[url].template:
            try:
                text = template.format(**post_obj._asdict())
            except (KeyError, IndexError, AttributeError) as err:
                if first_err is None:
                    first_err = ValueError(
                        'Failed to format template {!r} (data: {}): {}'
                        .format(template, post_obj, err))
                continue
            relayed = RelayedEvent(text)
            break
        if relayed is None:
            # all templates failed
            # NOTE(review): with no templates configured first_err is still None here
            raise first_err
        relayed.data = post_obj  # for any further tricky filtering
        reactor.callLater(0, self.interface.dispatch, relayed, source=self)

        processed += 1
        if self.feeds[url].process_max and processed >= self.feeds[url].process_max:
            break
def dispatch(self, msg):
    """Convert HTML in `msg` to plain text, expanding links to "text <url>".

    Link expansion runs when conf.process_links is missing/empty or has
    "enabled" set. A link is left as its bare text when:
     - it has no text or no href attribute,
     - detect_hashtags is set and the link sits in <span class="tag">,
     - detect_inlined_urls is set and link text is the same as its URL
       (compared without the leading http:// or https://).

    Returns text content of the processed markup. If `msg` is a RelayedEvent
    with "data" attribute, result is a new RelayedEvent carrying that data.
    """
    msg_etree = self.lxml_soup(msg)
    links_conf = self.conf.process_links

    if not links_conf or links_conf.enabled:
        # drop_tag() below alters the tree, hence iteration over a snapshot
        for tag in list(msg_etree.iter(tag='a')):
            href = tag.get('href')
            # Explicit checks instead of assert statements, which are
            #  removed under "python -O" and would make every link expand
            expand = bool(tag.text) and href is not None

            # Empty/missing config section has no detection flags to read
            if expand and links_conf and links_conf.detect_hashtags:
                parent = tag.getparent()
                is_hashtag = (
                    parent is not None
                    and parent.tag == 'span' and parent.get('class') == 'tag')
                expand = not is_hashtag

            if expand and links_conf and links_conf.detect_inlined_urls:
                href_bare, text_bare = (
                    re.sub(u'^https?://', '', val) for val in [href, tag.text])
                expand = href_bare != text_bare

            if expand:
                tag.text = u'{} <{}>'.format(tag.text, href)
            tag.drop_tag()

    msg_new = msg_etree.text_content()
    if isinstance(msg, RelayedEvent) and hasattr(msg, 'data'):
        # Keep attached event data on the processed message
        msg_new = RelayedEvent(msg_new)
        msg_new.data = msg.data
    return msg_new
def handle_line(self, line, path):
    """Wrap `line` into a RelayedEvent and schedule its dispatch.

    Event gets "data" attribute with the path (path.path) that `line`
    was passed along with.
    """
    log.noise('New line: {!r}'.format(line))
    relayed = RelayedEvent(force_unicode(line))
    relayed.data = AttrDict(path=path.path)
    # Dispatch is handed to reactor with zero delay instead of a direct call
    reactor.callLater(
        0, self.interface.dispatch, relayed, source=self)
def dispatch(self, msg):
    """Collect audit event lines and format completed events of watched keys.

    Lines are matched with self._re_base and cached per (node, ev_id) until
    a line of "EOE" type arrives for that event. Completed event is then
    formatted using conf.events.watches templates, if "key" value from its
    single SYSCALL line is listed in conf.events.watches.ev_keys.

    Returns RelayedEvent with "data" attribute set to the dict of extracted
    values, or None if the line is blank, does not match, only got cached,
    or the event has no key / key is not among the watched ones.

    Raises ValueError if either template fails to format.
    """
    if not msg.strip():
        return

    ## Event lines are cached until EOE msg is encountered
    match = self._re_base.search(msg)
    if not match:
        log.warn('Failed to match audit event spec: {!r}'.format(msg))
        return

    node, ev_id, ev_type, msg = (
        match.group(k) for k in ['node', 'ev_id', 'type', 'msg'])
    cache_key = node, ev_id
    if cache_key not in self._ev_cache:
        self._ev_cache[cache_key] = defaultdict(list)
        self._ev_cache[cache_key].update(ts=time.time(), node=node, ev_id=ev_id)
        self._ev_cache_gc()
    ev = self._ev_cache[cache_key]

    if ev_type != 'EOE':  # cache event data
        ev[ev_type].append(msg)
        return
    del self._ev_cache[cache_key]

    ## Get "key" value for event, if present
    ev_key = None
    try:
        syscall, = ev['SYSCALL']  # currently handled events always have it
    except ValueError:
        pass
    else:
        try:
            ev_key = self.get_msg_val(syscall, 'key', u'"(?P<val>[^"]+)"')
        except KeyError as err:
            log.noise('Failed to get ev_key from syscall: {}'.format(err))
    if not ev_key:
        log.noise('Unhandled event: {!r}'.format(ev))
        return

    ## Processing
    if ev_key not in self.conf.events.watches.ev_keys:
        return

    # Extract all necessary attributes
    ev_vals = dict(node=ev['node'], ev_id=ev['ev_id'], key=ev_key)
    for prefix, kind in it.product(['', 'e', 's', 'fs'], ['uid', 'gid']):
        k = prefix + kind  # uid, gid, euid, egid, suid, ...
        ev_vals[k] = self.get_msg_val(syscall, k)
    for k in 'comm', 'exe':
        ev_vals[k] = self.get_msg_val(syscall, k, u'"(?P<val>[^"]+)"')
    ev_vals['tty'] = self.get_msg_val(syscall, 'tty', r'(?P<val>\S+)')
    paths = list()
    for path_msg in ev['PATH']:
        paths.append(dict(
            path=self.get_msg_val(
                path_msg, 'name', u'(?P<val>"[^"]+"|\\(null\\)|[0-9A-F]+)'),
            inode=self.get_msg_val(path_msg, 'inode', fallback='nil'),
            dev=self.get_msg_val(
                path_msg, 'dev',
                r'(?P<val>[a-f\d]{2}:[a-f\d]{2})', fallback='nil')))

    # Formatting
    # Failures are raised right where they happen, so that nothing depends
    #  on exception name staying bound (or "event" being set) afterwards
    tpl = force_unicode(self.conf.events.watches.template_path)
    paths_fmt = list()
    for val in paths:
        try:
            paths_fmt.append(tpl.format(**val))
        except self._lookup_error as err:
            raise ValueError(
                'Failed to format template {!r} (data: {}): {}'.format(tpl, val, err))
    ev_vals['paths'] = ', '.join(paths_fmt)

    tpl = force_unicode(self.conf.events.watches.template)
    try:
        event = tpl.format(**ev_vals)
    except self._lookup_error as err:
        raise ValueError(
            'Failed to format template {!r} (data: {}): {}'.format(tpl, ev_vals, err))
    event = RelayedEvent(event)
    event.data = ev_vals
    return event