	def dispatch(self, msg):
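		# Look up the snort rule sid from the alert line and append known reference URLs to it.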
		match = self.regex.search(msg)
		if not match:
			log.debug('Failed to match snort rule-sid in msg: {!r}'.format(msg))
			return msg
		sid = match.group('sid')

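		# Don't add refs for alerts from ignored generator-ids (gid), if the regexp captures one.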
		if self.gid_ignore:
			try: gid = match.group('gid')
			except IndexError: pass
			else:
				if gid in self.gid_ignore: return msg

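		# Rebuild/reopen the sid-to-refs db if its source files were updated since the last mtime check.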
		ts = time()
		if self.sid_db_ts < ts - self.conf.sid_db_mtime_check_interval:
			if not os.path.exists(self.conf.paths.sid_db)\
					or max([0] + list( os.stat(p).st_mtime
						for p in [self.conf.paths.sid_src, self.conf.paths.refs]
						if os.path.exists(p) )) > os.stat(self.conf.paths.sid_db).st_mtime:
				self.update_sid_db()
			self.sid_db = anydbm.open(self.conf.paths.sid_db)

		try: ref = force_unicode(self.sid_db[force_bytes(sid)])
		except KeyError:
			log.info('Failed to find refs for sid: {!r} (msg: {!r})'.format(sid, msg))
		else: msg += u'\n  refs: {}'.format(ref)
		return msg
	@defer.inlineCallbacks
	def shorten(self, url):
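		# Shorten URLs that are at least length_min chars long via the configured
		#  api.type backend, stripping the scheme from the result.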
		url = force_bytes(url)
		if len(url) >= self.conf.length_min:
			try: func = getattr(self, 'shorten_{}'.format(self.conf.api.type))
			except AttributeError:
				raise ValueError('URL shortener "{}" is not supported'.format(self.conf.api.type))
			url = yield defer.maybeDeferred(func, url, self.conf.api.parameters)
		defer.returnValue(force_unicode(re.sub(r'^(?i)(https?|spdy)://', '', url)))
	@defer.inlineCallbacks
	def fetch_feed(self, url):
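		# Fetch and parse a single feed, dispatching an event for each new (non-duplicate) post.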
		feed_type = self.feeds[url].type

		err = None
		try: data = yield self.client.request(url)
		except HTTPClientError as err:
			log.warn('Failed to fetch feed ({}): {}'.format(url, err))
			data = None
		finally: self.schedule_fetch(url, fast=bool(err)) # do faster re-fetch on errors

		if data is None: defer.returnValue(None) # cache hit, not modified, error
		data, headers = data

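		# Parse the response body according to the configured feed type.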
		if feed_type == 'feed':
			import feedparser
			parser = feedparser.parse(data, response_headers=headers)
			feed, posts = parser.feed, parser.entries
		elif feed_type == 'reddit-json':
			from lya import AttrDict # mandatory dep anyway
			data = json.loads(data)['data']
			posts = list(AttrDict(post['data']) for post in data.pop('children'))
			feed = AttrDict(data)
		else:
			raise ValueError('Unrecognized feed type: {!r}'.format(self.feeds[url].type))

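		# Process posts in reverse order (oldest-first for typical feeds), skipping ones already seen.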
		count = 0
		for post in reversed(posts):
			if feed_type == 'reddit-json':
				# Some reddit-api-specific encoding hacks
				try: title = unescape(post['title'])
				except KeyError: pass
				else: post.title = title

			post_obj = FeedEntryInfo(feed, post, self.conf)

			post_id = list(
				force_bytes(post_obj.get_by_path(attr))
				for attr in self.feeds[url].deduplication )
			if not self.filter_db.add(url, post_id): continue

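			# Try templates in order, dispatching an event for the first one that formats without errors.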
			first_err = None
			for template in self.feeds[url].template:
				try: event = template.format(**post_obj._asdict())
				except (KeyError, IndexError, AttributeError) as err:
					if not first_err:
						first_err = ValueError(
							'Failed to format template {!r} (data: {}): {}'\
							.format(template, post_obj, err) )
					continue
				event = RelayedEvent(event)
				event.data = post_obj # for any further tricky filtering
				reactor.callLater(0, self.interface.dispatch, event, source=self)
				break
			else: raise first_err # all templates failed

			count += 1
			if self.feeds[url].process_max and count >= self.feeds[url].process_max: break
	@defer.inlineCallbacks
	def name_from_patch_link( self, link,
			_re_path=re.compile(r'\bpackages/[\w\-]+/(?P<name>[\w\-]+)/') ):
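		# Download a patch and collect package names from "packages/.../<name>/" paths
		#  found in its diffstat / rename / diff-header lines.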
		names = set()
		try: page = yield getPage(force_bytes(link), timeout=120)
		except Exception as err:
			log.warn('Failed to download patch: {}'.format(err))
			defer.returnValue(None)
		page = it.imap(op.methodcaller('strip'), page.splitlines())
		for line in page:
			if re.search(r'^\s*(\S+\s+\|\s+\d+\s+[\-+]*\s*$|rename |diff --git |[\-+]{3} )', line):
				line = _re_path.search(line)
				if line: names.add(line.group('name'))
		defer.returnValue(names)
	def hash(self, val):
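		# Stable sha256 digest of a string or of a sequence of strings (NUL-joined).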
		if not isinstance(val, types.StringTypes): val = '\0'.join(val)
		val = force_bytes(val)
		return hashlib.sha256(val).digest()
	@defer.inlineCallbacks
	def request(self, url, method='get', decode=None, encode=None, data=None):
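		# Make an HTTP request, with optional request-body encoding ('form' or 'json'),
		#  response decoding ('json') and conditional-GET caching via validator headers.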
		method, url = force_bytes(method).upper(), force_bytes(url)
		headers = {'User-Agent': self.user_agent}

		if method == 'GET' and self.use_cache_headers:
			# Use cached validators to avoid re-downloading/re-processing unchanged content
			cache = self.fetch_cache.get(url, dict())
			if 'cache-control' in cache and cache['cache-control'] >= time.time():
				defer.returnValue(None) # no need to re-process same thing
			if 'last-modified' in cache:
				headers['If-Modified-Since'] = rfc822date(cache['last-modified'])
			if 'etag' in cache: headers['If-None-Match'] = '"{}"'.format(cache['etag'])

		log.noise( 'HTTP request: {} {} (h: {}, enc: {}, dec: {}, data: {!r})'\
			.format(method, url[:100], headers, encode, decode, type(data)) )

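		# Wrap the request body into a FileBodyProducer, serializing it according to "encode".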
		if data is not None:
			if encode is None:
				if isinstance(data, types.StringTypes): data = io.BytesIO(data)
			elif encode == 'form':
				headers.setdefault('Content-Type', 'application/x-www-form-urlencoded')
				data = io.BytesIO(urlencode(data))
			elif encode == 'json':
				headers.setdefault('Content-Type', 'application/json')
				data = io.BytesIO(json.dumps(data))
			else: raise ValueError('Unknown request encoding: {}'.format(encode))
			data_raw, data = data, FileBodyProducer(data)
		else: data_raw = None
		if decode not in ['json', None]:
			raise ValueError('Unknown response decoding method: {}'.format(decode))

		requests = None # rebound to the module by the fallback import below (e.g. for an ipv6-only site)
		err = None
		try:
			res = yield self.request_agent.request( method, url,
				Headers(dict((k,[v]) for k,v in (headers or dict()).viewitems())), data )
		except error.DNSLookupError:
			import requests, socket
			try:
				res = yield self.sync_wrap(
					getattr(requests, method.lower()), url, headers=headers, data=data_raw )
			except ( socket.error, SyncTimeout,
				requests.exceptions.RequestException ) as err: pass
		except ( RequestTransmissionFailed,
			RequestNotSent, ResponseFailed ) as err: pass

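		# Raise or suppress whatever lookup/connection error was caught above.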
		if err:
			if not self.hide_connection_errors:
				raise HTTPClientError(None, 'Lookup/connection error: {}'.format(err))
			else:
				log.debug('Lookup/connection error (suppressed): {}'.format(err))
				defer.returnValue(None) # should also suppress fast refetching

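		# Normalize status-line fields between twisted Agent and requests responses.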
		code, phrase, version = (res.code, res.phrase, res.version)\
			if not requests else ( res.status_code,
				http.RESPONSES[res.status_code], ('HTTP', 1, 1) )
		log.noise( 'HTTP request done ({} {}): {} {} {}'\
			.format(method, url[:100], code, phrase, version) )
		if code in [http.NO_CONTENT, http.NOT_MODIFIED]: defer.returnValue(None)
		if code not in [http.OK, http.CREATED]: raise HTTPClientError(code, phrase)

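		# Read the response body and headers from whichever client handled the request.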
		if not requests:
			data = defer.Deferred()
			res.deliverBody(DataReceiver(data))
			data = yield data
			headers = dict((k, v[-1]) for k,v in res.headers.getAllRawHeaders())
		else:
			try:
				data = yield self.sync_wrap(getattr, res, 'text')
				headers = yield self.sync_wrap(getattr, res, 'headers')
			except (requests.exceptions.RequestException, SyncTimeout) as err:
				raise HTTPClientError(None, 'Sync connection error: {}'.format(err))

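		# Store validator/expiry headers to make future GETs for the same URL conditional.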
		if method == 'GET' and self.use_cache_headers:
			cache = dict((k.lower(), v) for k,v in headers.items())
			cache = dict( (k, cache[k]) for k in
				['last-modified', 'cache-control', 'etag'] if k in cache )
			# Update headers' cache
			if 'last-modified' in cache:
				ts = rfc822.parsedate_tz(cache['last-modified'])
				cache['last-modified'] = time.mktime(ts[:9]) + (ts[9] or 0)
			if 'cache-control' in cache:
				match = re.search(r'\bmax-age=(\d+)\b', cache.pop('cache-control'))
				if match: cache['cache-control'] = time.time() + int(match.group(1))
			if cache: self.fetch_cache[url] = cache

		defer.returnValue((json.loads(data) if decode is not None else data, headers))