def _do_find(url, max_trys, printer, rss_proxy_url, rss_proxy_token):
    def message_handler(msg):
        print(msg)

    finder = FeedFinder(
        url,
        max_trys=max_trys,
        rss_proxy_url=rss_proxy_url,
        rss_proxy_token=rss_proxy_token,
        message_handler=message_handler,
    )
    with finder:
        found = finder.find()
    if found:
        response, raw_result = found
        printer('-> {}'.format(response))
        result = FeedParser().parse(raw_result)
        printer('-> {}'.format(result))
        printer('-' * 79)
        printer(pretty_format_json(result.feed))
        for i, story in enumerate(result.storys):
            printer('{:03d}{}'.format(i, '-' * 76))
            story['content'] = shorten(story['content'], 60)
            story['summary'] = shorten(story['summary'], 60)
            printer(pretty_format_json(story))
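# `pretty_format_json` and `shorten` above are helpers defined elsewhere in
# the codebase; minimal stand-ins are sketched here so the snippet runs on
# its own (the real implementations may differ).
import json


def pretty_format_json(data) -> str:
    # Assumed behavior: indented JSON that keeps non-ASCII text readable.
    return json.dumps(data, ensure_ascii=False, indent=4)


def shorten(text, width):
    # Assumed behavior: truncate long story content for terminal display.
    if text and len(text) > width:
        return text[:width] + '...'
    return text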
def write(self, response: FeedResponse):
    content_length = 0
    if response.content:
        content_length = len(response.content)
    feed_type = response.feed_type.value if response.feed_type else None
    filename = None
    if response.content:
        file_ext = self._get_file_ext(response)
        filename = os.path.basename(self._filename) + file_ext
    meta = dict(
        filename=filename,
        url=response.url,
        status=response.status,
        content_length=content_length,
        encoding=response.encoding,
        feed_type=feed_type,
        mime_type=response.mime_type,
        use_proxy=response.use_proxy,
        etag=response.etag,
        last_modified=response.last_modified,
    )
    os.makedirs(self._output_dir, exist_ok=True)
    with open(self._meta_filepath, 'w') as f:
        f.write(pretty_format_json(meta))
    if filename:
        filepath = _normalize_path(os.path.join(self._output_dir, filename))
        with open(filepath, 'wb') as f:
            f.write(response.content)
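# A usage sketch for the writer above, assuming it is a method of
# FeedResponseFile (the class `_do_parse` below reads from). The path is
# illustrative; `write` stores the meta JSON plus the raw content file, and
# `read` reconstructs a FeedResponse from disk.
def save_and_reload(response: FeedResponse) -> FeedResponse:
    response_file = FeedResponseFile('data/sample-feed')
    response_file.write(response)   # meta JSON + raw content file
    return response_file.read()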
def main(verify, filepath, verify_bias):
    if verify:
        if verify != '-':
            with open(verify) as f:
                data = json.load(f)
            result = [(x['name'], x['count']) for x in data['tables']]
        else:
            result = pg_count()
        if filepath and filepath != '-':
            with open(filepath) as f:
                content = f.read()
        else:
            content = sys.stdin.read()
        expect_data = json.loads(content)
        expect_result = [(x['name'], x['count']) for x in expect_data['tables']]
        is_ok = pg_verify(result, expect_result, verify_bias)
        sys.exit(0 if is_ok else 1)
    else:
        result = pg_count()
        tables = [dict(name=name, count=count) for name, count in result]
        content = pretty_format_json(dict(tables=tables))
        if filepath and filepath != '-':
            with open(filepath, 'w') as f:
                f.write(content)
        else:
            print(content)
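# `pg_verify` is assumed to compare actual vs expected (name, count) pairs
# within a tolerance; a minimal sketch of that contract (the real
# implementation may report differences differently):
def pg_verify(result, expect_result, bias) -> bool:
    expect = dict(expect_result)
    is_ok = True
    for name, count in result:
        expect_count = expect.get(name)
        # A table passes when its count is within +/- bias of the expected count.
        if expect_count is None or abs(count - expect_count) > bias:
            print(f'table {name}: expect {expect_count}, got {count}')
            is_ok = False
    return is_ok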
def _do_parse(
    url: str,
    printer,
    checksum,
    save_checksum,
    proxy_url,
    rss_proxy_url,
    rss_proxy_token,
):
    if not url.startswith('http://') and not url.startswith('https://'):
        response_file = FeedResponseFile(url)
        response = response_file.read()
    else:
        reader = FeedReader(
            proxy_url=proxy_url,
            rss_proxy_url=rss_proxy_url,
            rss_proxy_token=rss_proxy_token,
        )
        with reader:
            response = reader.read(url, use_proxy=reader.has_proxy)
    print('-> {}'.format(response))
    if not response.ok:
        return
    if checksum:
        with open(checksum, 'rb') as f:
            data = f.read()
        checksum = FeedChecksum.load(data)
        print('-> {}'.format(checksum))
    else:
        checksum = None
    raw_result = RawFeedParser().parse(response)
    if raw_result.warnings:
        print('Warning: ' + '; '.join(raw_result.warnings))
    result = FeedParser(checksum=checksum).parse(raw_result)
    print('-> {}'.format(result))
    printer('-' * 79)
    printer(pretty_format_json(result.feed))
    for i, story in enumerate(result.storys):
        printer('{:03d}{}'.format(i, '-' * 76))
        story['content'] = shorten(story['content'], 60)
        story['summary'] = shorten(story['summary'], 60)
        printer(pretty_format_json(story))
    if save_checksum:
        print('-> save {}'.format(save_checksum))
        data = result.checksum.dump()
        with open(save_checksum, 'wb') as f:
            f.write(data)
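# The checksum round-trip used above, isolated: dump after one parse, load
# before the next, so FeedParser can detect unchanged storys. The file name
# is illustrative; the load/dump calls mirror those in `_do_parse`.
def parse_with_saved_checksum(raw_result, path='feed.checksum'):
    try:
        with open(path, 'rb') as f:
            checksum = FeedChecksum.load(f.read())
    except FileNotFoundError:
        checksum = None  # first run: no previous checksum yet
    result = FeedParser(checksum=checksum).parse(raw_result)
    with open(path, 'wb') as f:
        f.write(result.checksum.dump())
    return result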
def delete_invalid_feeds(days=1, limit=100, threshold=99):
    sql = """
    SELECT feed_id, title, link, url, status_code, count
    FROM (
        SELECT feed_id, status_code, count(1) AS count
        FROM rssant_api_rawfeed
        WHERE dt_created >= %s
            AND (status_code < 200 OR status_code >= 400)
        GROUP BY feed_id, status_code
        HAVING count(1) > 3
        ORDER BY count DESC
        LIMIT %s
    ) error_feed
    JOIN rssant_api_feed ON error_feed.feed_id = rssant_api_feed.id
    ORDER BY feed_id, status_code, count;
    """
    sql_ok_count = """
    SELECT feed_id, count(1) AS count
    FROM rssant_api_rawfeed
    WHERE dt_created >= %s
        AND (status_code >= 200 AND status_code < 400)
        AND feed_id = ANY(%s)
    GROUP BY feed_id
    """
    t_begin = timezone.now() - timezone.timedelta(days=days)
    error_feeds = defaultdict(dict)
    with connection.cursor() as cursor:
        cursor.execute(sql, [t_begin, limit])
        for feed_id, title, link, url, status_code, count in cursor.fetchall():
            error_feeds[feed_id].update(feed_id=feed_id, title=title, link=link, url=url)
            error = error_feeds[feed_id].setdefault('error', {})
            error_name = FeedResponseStatus.name_of(status_code)
            error[error_name] = count
            error_feeds[feed_id]['error_count'] = sum(error.values())
            error_feeds[feed_id].update(ok_count=0, error_percent=100)
        cursor.execute(sql_ok_count, [t_begin, list(error_feeds)])
        for feed_id, ok_count in cursor.fetchall():
            feed = error_feeds[feed_id]
            total = feed['error_count'] + ok_count
            error_percent = round((feed['error_count'] / total) * 100)
            feed.update(ok_count=ok_count, error_percent=error_percent)
    error_feeds = sorted(
        error_feeds.values(), key=lambda x: x['error_percent'], reverse=True)
    delete_feed_ids = []
    for feed in error_feeds:
        if feed['error_percent'] >= threshold:
            delete_feed_ids.append(feed['feed_id'])
            click.echo(pretty_format_json(feed))
    if delete_feed_ids:
        confirm_delete = click.confirm(f'Delete {len(delete_feed_ids)} feeds?')
        if not confirm_delete:
            click.echo('Abort!')
        else:
            UnionFeed.bulk_delete(delete_feed_ids)
            click.echo('Done!')
    return error_feeds
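# A worked example of the error_percent arithmetic above (pure illustration):
error_count, ok_count = 30, 10
error_percent = round(error_count / (error_count + ok_count) * 100)  # -> 75
# 75 < 99 (the default threshold), so this feed would not be selected for deletion.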
async def do_register(
    ctx: ActorContext, node: NodeSpecSchema
) -> T.dict(nodes=T.list(NodeSpecSchema)):
    LOG.info(f'register node:\n{pretty_format_json(node)}')
    existed = ctx.registery.get(node['name'])
    if existed and existed.to_spec() == node:
        LOG.info(f'register node {node["name"]} already existed and no changes')
    else:
        ctx.registery.add(node)
        LOG.info('current registery info:\n' + pretty_format_json(ctx.registery.to_spec()))
        await ctx.tell('scheduler.save_registery')
    return dict(nodes=ctx.registery.to_spec())
def do_load_registery(ctx: ActorContext):
    registery_node = ctx.registery.registery_node.name
    LOG.info(f'load registery info for {registery_node}')
    registery = Registery.get(registery_node)
    if registery:
        ctx.registery.update(registery.node_specs)
        title = 'loaded'
    else:
        title = 'current'
    LOG.info(f'{title} registery info:\n' + pretty_format_json(ctx.registery.to_spec()))
    ctx.tell('scheduler.boardcast_registery')
def on_startup(app):
    while True:
        try:
            r = app.ask('scheduler.register', dict(node=app.registery.current_node.to_spec()))
        except Exception as ex:
            LOG.warning(f'ask scheduler.register failed: {ex}')
            time.sleep(3)
        else:
            app.registery.update(r['nodes'])
            break
    nodes = pretty_format_json(app.registery.to_spec())
    LOG.info('current registery:\n' + nodes)
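# Both startup hooks share this register-until-success loop; a hypothetical
# helper that captures the pattern (name and signature are illustrative,
# not part of the codebase):
import logging
import time

LOG = logging.getLogger(__name__)


def retry_forever(fn, *, wait=3, what='call'):
    # Keep retrying every `wait` seconds until `fn` succeeds, then
    # return its result.
    while True:
        try:
            return fn()
        except Exception as ex:
            LOG.warning(f'{what} failed: {ex}')
            time.sleep(wait)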
def on_startup(app):
    while True:
        try:
            r = app.ask('scheduler.register', dict(node=app.registery.current_node.to_spec()))
        except Exception as ex:
            LOG.warning(f'ask scheduler.register failed: {ex}')
            time.sleep(3)
        else:
            app.registery.update(r['nodes'])
            break
    nodes = pretty_format_json(app.registery.to_spec())
    LOG.info('current registery:\n' + nodes)
    if app.kong_client:
        LOG.info(f'kong register {app.name} url={app.kong_actor_url}')
        while True:
            try:
                app.kong_client.register(app.name, app.kong_actor_url)
            except Exception as ex:
                LOG.warning(f'kong register failed: {ex}')
                time.sleep(3)
            else:
                break
def do_update_registery(ctx, nodes: T.list(NodeSpecSchema)):
    LOG.info(f'update registery {ctx.message}')
    ctx.registery.update(nodes)
    nodes = pretty_format_json(ctx.registery.to_spec())
    LOG.info('current registery:\n' + nodes)
def do_load_registery(ctx: ActorContext):
    registery_node = ctx.registery.registery_node.name
    registery_info = pretty_format_json(ctx.registery.to_spec())
    LOG.info(f'load registery info for {registery_node}:\n' + registery_info)
    ctx.tell('scheduler.boardcast_registery')
def print_health(self):
    print(pretty_format_json(self.health()))