def fetch(self, url: str) -> str:
    """Return the HTML for ``url``, reading through ``self._storage``.

    A stored copy is returned as-is. Otherwise the page is loaded with
    ``self.browser`` when one is set, or with a cookie-aware ``urllib``
    opener that sends browser-like headers. The fetched HTML is saved in
    ``self._storage`` before it is returned.

    Args:
        url: Address of the page to fetch; also used as the storage key.

    Returns:
        The page source as text.
    """
    html = self._storage.get(url)
    if html is not None:
        return html

    if self.browser is not None:
        html = self.browser.get(url)
    else:
        req = urllib.request.Request(
            url,
            None,
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Charset": "utf-8;q=0.7,*;q=0.3",
                "Accept-Language": "en-US,en;q=0.8",
                "Connection": "keep-alive",
            },
        )
        cj = CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cj))
        # Close the response as soon as the body has been read instead of
        # leaving the connection open until garbage collection.
        with opener.open(req) as response:
            content = response.read()
        charset = cchardet.detect(content)
        # Fall back to UTF-8 when no encoding is reported.
        html = content.decode(charset["encoding"] or "utf-8")

    logger.info(Fore.GREEN, "Sent", f"{url} {len(html)}")
    self._storage[url] = html
    logger.info(Fore.BLUE, "Storage", f"Set<{url}>")
    return html
def register(self, item):
    """Register an item class and compile a matcher for each of its routes.

    A repeated registration is logged and terminates the process. For
    every ``alias -> route`` pair in ``item.Meta.route`` the alias is
    turned into an anchored regular expression in which each ``:name``
    placeholder becomes a named group. The compiled pattern is recorded
    in ``self.alias_re`` and ``self.items``. A browser is created the
    first time an item asks for ``with_ajax``.

    Args:
        item: Item class exposing ``Meta.route``, ``__base_url__`` and
            ``__name__``.

    Raises:
        SystemExit: If ``item`` is already registered.
    """
    if item in self.item_classes:
        logger.error('Register', 'Repeat register item <%s>' % (item.__name__))
        # ``exit()`` is an interactive helper installed by the ``site``
        # module; raise the exception it wraps instead.
        raise SystemExit()
    self.item_classes.append(item)
    item.__base_url__ = item.__base_url__ or self.base_url
    for define_alias, define_route in OrderedDict(item.Meta.route).items():
        # Raw strings keep the regex escapes (\?, \s, \-) from being read
        # as invalid string escapes; ``re`` itself understands \uXXXX.
        alias = '^' + define_alias.replace('?', r'\?') + '$'
        _alias_re = re.compile(
            re.sub(
                ':(?P<params>[a-z_]+)',
                lambda m: r'(?P<{}>[A-Za-z0-9_?&/=\s\-\u4e00-\u9fa5]+)'.
                format(m.group('params')), alias))
        self.alias_re.append((define_alias, _alias_re))
        self.items[define_alias].append({
            'item': item,
            'alias_re': _alias_re,
            'alias': define_alias,
            'route': item.__base_url__ + define_route
        })
    logger.info(Fore.GREEN, 'Register', '<%s>' % (item.__name__))
    item_with_ajax = getattr(item.Meta, 'web', {}).get('with_ajax', False)
    if self.browser is None and item_with_ajax:
        self.browser = self.get_browser(settings=self.settings,
                                        item_with_ajax=item_with_ajax)
def serve(self, ip='127.0.0.1', port=5000, **options):
    """Log the serving address and run ``self.server`` on it.

    Args:
        ip: Address passed to ``self.server.run``.
        port: Port passed to ``self.server.run``.
        **options: Extra keyword arguments forwarded to ``self.server.run``.

    Raises:
        SystemExit: After logging, when starting or running the server
            raises any ``Exception``.
    """
    try:
        logger.info(Fore.WHITE, 'Serving', 'http://%s:%s' % (ip, port))
        self.server.run(ip, port, **options)
    except Exception as e:
        logger.error('Serving', '%s' % str(e))
        # ``exit()`` is an interactive helper installed by the ``site``
        # module; raise the exception it wraps instead.
        raise SystemExit()
def wrapper(error=None, ttl=ttl, *args, **kwargs):
    """Serve ``func``'s result as JSON, going through the cache first.

    When ``error`` is truthy the cache key is the request path plus its
    query string (if any); otherwise there is no key. A cached value is
    returned directly. If there is none, ``func`` is called and a truthy
    result is stored under the key before being returned.
    """
    key = None  # TODO
    if error:
        from flask import request
        parsed = urlparse(request.url)
        if parsed.query == '':
            key = request.path
        else:
            key = '{}?{}'.format(parsed.path, parsed.query)

    cache_key = key
    ttl = ttl or self.ttl
    get_label = 'Get<%s>' % cache_key
    set_label = 'Set<%s>' % cache_key

    # A failing cache read is logged and treated as a miss.
    try:
        if self.exists(cache_key):
            logger.info(Fore.YELLOW, 'Cache', get_label)
            return jsonify(self.get(cache_key, **kwargs))
    except Exception:
        logger.exception('Cache', get_label)

    result = func(error, url=key, *args, **kwargs)
    if not result or not cache_key:
        return jsonify(result)

    # A failing cache write is logged; the fresh result is still returned.
    try:
        if self.set(cache_key, result, ttl=ttl, **kwargs):
            logger.info(Fore.YELLOW, 'Cache', set_label)
    except Exception:
        logger.exception('Cache', set_label)
    return jsonify(result)
def set_cache(self, key, value):
    """Store ``value`` under ``key`` unless the cache already has an entry.

    Returns:
        True when the value was stored, False when an entry already
        existed or the cache reported that nothing was written.
    """
    if self.cache.get(key) is not None:
        return False
    if not self.cache.set(key, value):
        return False
    logger.info(Fore.YELLOW, 'Cache', 'Set<%s>' % key)
    self.update_status('_status_cache_set')
    return True
def fetch_page_source(self, url, item, params=None, **kwargs):
    """Fetch the html of given url.

    Uses ``self.browser`` when the item's ``Meta.web`` sets ``with_ajax``
    and a browser is available; otherwise issues a ``requests.get`` with
    a 15 second timeout. The outcome is logged, the text is passed to
    ``self.set_storage`` and then returned.

    Args:
        url: Address to fetch.
        item: Item class whose ``Meta.web`` options select the fetch mode
            and may supply ``request_config``.
        params: Query parameters forwarded to ``requests.get``.
        **kwargs: Accepted but not used by this method.

    Returns:
        The page source as text.
    """
    self.update_status('_status_sent')
    if getattr(item.Meta, 'web', {}).get(
            'with_ajax', False) and self.browser is not None:
        self.browser.get(url)
        text = self.browser.page_source
        if text != '':
            logger.info(Fore.GREEN, 'Sent', '%s %s 200' % (url, len(text)))
        else:
            logger.error('Sent', '%s %s' % (url, len(text)))
        result = text
    else:
        # Item-level request settings win over the instance-wide ones.
        request_config = getattr(item.Meta, 'web', {}).get(
            'request_config', {}) or self.web.get('request_config', {})
        response = requests.get(url, params=params, timeout=15, **request_config)
        content = response.content
        charset = cchardet.detect(content)
        # Decoding with ``None`` raises TypeError, so fall back to UTF-8
        # when no encoding is detected.
        text = content.decode(charset['encoding'] or 'utf-8')
        if response.status_code != 200:
            logger.error(
                'Sent', '%s %s %s' % (url, len(text), response.status_code))
        else:
            logger.info(
                Fore.GREEN, 'Sent',
                '%s %s %s' % (url, len(text), response.status_code))
        result = text
    self.set_storage(url, result)
    return result
def page_not_found(error):
    """Answer a request by serving the cached or freshly parsed result.

    The request's full path (without a dangling ``?``) is looked up via
    ``api.get_cache``. On a miss it is parsed with ``api.parse`` and the
    result is cached.

    Returns:
        A JSON response with status 200, ``('Not Found', 404)`` when
        parsing yields nothing, or ``(message, 500)`` when an exception
        is raised.
    """
    start_time = time()
    path = request.full_path
    # Strip a dangling '?' from the end of the path.
    if path.endswith('?'):
        path = path[:-1]
    try:
        res = api.get_cache(path)
        if res is None:
            res = api.parse(path)
            if res is None:
                logger.error('Received', '%s 404' % request.url)
                return 'Not Found', 404
            # Cache only real results: writing ``None`` gains nothing and
            # a failing write would turn this 404 into a 500.
            api.set_cache(path, res)
        api.update_status('_status_received')
        end_time = time()
        time_usage = end_time - start_time
        logger.info(
            Fore.GREEN, 'Received', '%s %s 200 %.2fms' %
            (request.url, len(res), time_usage * 1000))
        return app.response_class(response=res,
                                  status=200,
                                  mimetype='application/json')
    except Exception as e:
        return str(e), 500
def run(self, host="127.0.0.1", port=5000, **options):
    """Log the serving URL and run ``self.app`` on it.

    Args:
        host: Address passed to ``self.app.run``.
        port: Port passed to ``self.app.run``.
        **options: Extra keyword arguments forwarded to ``self.app.run``.

    Raises:
        SystemExit: After logging the error and its traceback, when
            starting or running the app raises any ``Exception``.
    """
    try:
        logger.info(Fore.GREEN, "Serving", f"http://{host}:{port}")
        self.app.run(host, port, **options)
    except Exception as e:
        logger.error("Serving", "%s" % str(e))
        logger.error("Serving", "%s" % str(traceback.format_exc()))
        # ``exit()`` is an interactive helper installed by the ``site``
        # module; raise the exception it wraps instead.
        raise SystemExit()
def register(self, item):
    """Add ``item`` to the registered item classes.

    The item falls back to ``self.base_url`` when it has no base URL of
    its own. A browser is created the first time an item asks for
    ``with_ajax`` in its ``Meta.web`` options.
    """
    item.__base_url__ = item.__base_url__ or self.base_url
    description = '<%s:%s>' % (item.Meta.route, item.__name__)
    logger.info(Fore.WHITE, 'Register', description)
    self.item_classes.append(item)

    web_options = getattr(item.Meta, 'web', {})
    item_with_ajax = web_options.get('with_ajax', False)
    if self.browser is not None or not item_with_ajax:
        return
    self.browser = self.get_browser(settings=self.settings,
                                    item_with_ajax=item_with_ajax)
def run(self, host='127.0.0.1', port=5000, **options):
    """Log the serving URL and run ``self.app`` on it.

    Args:
        host: Address passed to ``self.app.run``.
        port: Port passed to ``self.app.run``.
        **options: Extra keyword arguments forwarded to ``self.app.run``.

    Raises:
        SystemExit: After logging the error and its traceback, when
            starting or running the app raises any ``Exception``.
    """
    try:
        logger.info(Fore.GREEN, 'Serving', f'http://{host}:{port}')
        self.app.run(host, port, **options)
    except Exception as e:
        logger.error('Serving', '%s' % str(e))
        logger.error('Serving', '%s' % str(traceback.format_exc()))
        # ``exit()`` is an interactive helper installed by the ``site``
        # module; raise the exception it wraps instead.
        raise SystemExit()
def _parse_item(self, html, item):
    """Run ``item.parse`` on ``html`` and return ``{item.name: parsed}``.

    The item name and the size of the parsed value are logged.
    """
    parsed = item.parse(html)
    summary = 'Item<%s[%s]>' % (item.name.title(), len(parsed))
    logger.info(Fore.CYAN, 'Parsed', summary)
    return {item.name: parsed}
def get_cache(self, key, default=None):
    """Return the cached value for ``key``, or ``default`` on a miss.

    A hit is logged and counted via ``self.update_status``.
    """
    cached = self.cache.get(key)
    if cached is None:
        return default
    logger.info(Fore.YELLOW, 'Cache', 'Get<%s>' % key)
    self.update_status('_status_cache_get')
    return cached
def get_storage(self, key, default=None):
    """Return the stored value for ``key``, or ``default`` on a miss.

    A hit is logged and counted via ``self.update_status``.
    """
    stored = self.storage.get(key)
    if stored is None:
        return default
    logger.info(Fore.BLUE, 'Storage', 'Get<%s>' % key)
    self.update_status('_status_storage_get')
    return stored
def parse_item(self, html, item):
    """Run ``item.parse`` on ``html`` and return ``{item.__name__: parsed}``.

    An empty parse result is logged as an error, anything else as info.
    """
    name = item.__name__
    parsed = item.parse(html)
    count = len(parsed)
    summary = 'Item<%s[%s]>' % (name.title(), count)
    if count == 0:
        logger.error('Parsed', summary)
    else:
        logger.info(Fore.CYAN, 'Parsed', summary)
    return {name: parsed}
def fn(item):
    """Record ``item`` with its source/target route formats and return it.

    Returning the item unchanged lets this be used as a decorator.
    """
    route = [source_format, target_format, item]
    self._routes.append(route)
    description = f"<{item.__name__}: {source_format} {target_format}>"
    logger.info(Fore.GREEN, "Register", description)
    return item
def _fetch_page_source(self, url, params=None, **kwargs):
    """Return the page source of ``url``.

    With ``self.with_ajax`` set the page is loaded in ``self._browser``;
    otherwise it is fetched with ``requests.get``, forwarding ``params``
    and ``**kwargs``.
    """
    if not self.with_ajax:
        text = requests.get(url, params=params, **kwargs).text
    else:
        browser = self._browser
        browser.get(url)
        text = browser.page_source
    logger.info(Fore.GREEN, 'Sent', '%s %s' % (url, len(text)))
    return text
def set_storage(self, key, value):
    """Save ``value`` under ``key`` unless storage already has an entry.

    Returns:
        True when the value was saved. False when an entry already
        existed, the save reported failure, or an exception was raised
        (the exception is logged, not propagated).
    """
    try:
        if self.storage.get(key) is not None:
            return False
        if not self.storage.save(key, value):
            return False
        logger.info(Fore.BLUE, 'Storage', 'Set<%s>' % key)
        self.update_status('_status_storage_set')
        return True
    except Exception as exc:
        logger.error('Storage', 'Set<{}>'.format(str(exc)))
        return False
def page_not_found(error):
    """Answer a request by parsing its path (plus query string) as an API call.

    Returns:
        The JSON response built from ``self.parse(url)``, or
        ``(message, 500)`` when parsing or serialising raises.
    """
    parse_result = urlparse(request.url)
    if parse_result.query != '':
        url = '{}?{}'.format(parse_result.path, parse_result.query)
    else:
        url = request.path
    try:
        res = jsonify(self.parse(url))
        logger.info(Fore.GREEN, 'Received',
                    '%s %s' % (request.url, len(res.response[0])))
        return res
    except Exception as e:
        # A bare string would be sent with status 200 and look like a
        # success; report the failure as a server error.
        return str(e), 500
def handler(path):
    """Parse the requested path and return the result as JSON.

    The request is timed and logged. Any exception is logged together
    with its traceback and answered with a generic JSON error and
    status 500.
    """
    try:
        started = time()
        full_path = request.full_path.strip('?')
        results = self.parse_url(full_path)
        elapsed = time() - started
        res = jsonify(results)
        summary = '%s %s 200 %.2fms' % (
            request.url, len(res.response), elapsed * 1000)
        logger.info(Fore.GREEN, 'Received', summary)
        return res
    except Exception as e:
        logger.error('Serving', f'{e}')
        logger.error('Serving', '%s' % str(traceback.format_exc()))
        error_body = {'msg': 'System Error', 'code': -1}
        return jsonify(error_body), 500
def new(output_dir):
    """Create a new Toapi project."""
    # Imported locally so the block does not depend on module-level imports.
    import shutil
    import subprocess

    if os.path.exists(output_dir):
        logger.error('New project', 'Directory already exists.')
        return

    logger.info(Fore.GREEN, 'New project',
                'Creating project directory "%s"' % output_dir)
    # Run git with an argument list instead of a shell string, so the
    # directory name is never interpreted by a shell (spaces, quotes, ';').
    # '--' stops a name starting with '-' from being read as an option.
    try:
        status = subprocess.call([
            'git', 'clone', '--',
            'https://github.com/toapi/toapi-template', output_dir,
        ])
    except OSError as e:
        logger.error('New project', 'Could not run git: %s' % e)
        return
    if status != 0:
        # Do not report success for a clone that did not happen.
        logger.error('New project',
                     'git clone failed with exit status %s' % status)
        return

    # Detach the new project from the template's history; best effort,
    # like the ``rm -rf`` it replaces, but portable.
    shutil.rmtree(os.path.join(output_dir, '.git'), ignore_errors=True)
    logger.info(Fore.GREEN, 'New project', 'Success!')
    click.echo('')
    click.echo(' cd %s' % output_dir)
    click.echo(' toapi run')
    click.echo('')
def handler(path):
    """Parse the requested path and return the result as JSON.

    The request is timed and logged. Any exception is logged together
    with its traceback and answered with a generic JSON error and
    status 500.
    """
    try:
        started = time()
        full_path = request.full_path.strip("?")
        results = self.parse_url(full_path)
        elapsed = time() - started
        res = jsonify(results)
        summary = "%s %s 200 %.2fms" % (
            request.url, len(res.response), elapsed * 1000)
        logger.info(Fore.GREEN, "Received", summary)
        return res
    except Exception as e:
        logger.error("Serving", f"{e}")
        logger.error("Serving", "%s" % str(traceback.format_exc()))
        error_body = {"msg": "System Error", "code": -1}
        return jsonify(error_body), 500
def wrapper(*args, **kwargs):
    """Call ``func`` through a cache built from ``cache_class``.

    The cache key is the enclosing ``key`` or, failing that, a
    ``dynamic_key`` keyword argument (which is removed from ``kwargs``).
    A cached value is returned directly. If there is none, ``func`` runs
    and a truthy result is stored under the key. Cache failures are
    logged and never propagate.
    """
    cache_key = key or kwargs.pop('dynamic_key', None)
    if isinstance(cache_config, dict):
        # The merged options are passed to the cache calls and to func.
        kwargs.update(cache_config)
    cache_ins = cache_class(serializer=serializer, **kwargs)
    get_label = 'Get<%s>' % cache_key
    set_label = 'Set<%s>' % cache_key

    try:
        if cache_ins.exists(cache_key):
            logger.info(Fore.YELLOW, 'Cache', get_label)
            return cache_ins.get(cache_key, **kwargs)
    except Exception:
        logger.exception('Cache', get_label)

    result = func(*args, **kwargs)
    if not result or not cache_key:
        return result

    try:
        if cache_ins.set(cache_key, result, ttl=ttl, **kwargs):
            logger.info(Fore.YELLOW, 'Cache', set_label)
    except Exception:
        logger.exception('Cache', set_label)
    return result
def parse_url(self, full_path: str) -> dict:
    """Return the parsed items for ``full_path``, using ``self._cache``.

    On a cache miss every registered route is tried. For each route whose
    format converts ``full_path``, the target page is fetched and parsed
    by the route's item. The combined mapping of item name to parsed
    result is cached and returned.
    """
    cached = self._cache.get(full_path)
    if cached is not None:
        logger.info(Fore.YELLOW, "Cache", f"Get<{full_path}>")
        return cached

    results = {}
    for source_format, target_format, item in self._routes:
        parsed_path = self.convert_string(
            full_path, source_format, target_format
        )
        if parsed_path is None:
            # This route does not apply to the requested path.
            continue
        full_url = self.absolute_url(item._site, parsed_path)
        result = item.parse(self.fetch(full_url))
        logger.info(
            Fore.CYAN,
            "Parsed",
            f"Item<{item.__name__}[{len(result)}]>",
        )
        results[item.__name__] = result

    self._cache[full_path] = results
    logger.info(Fore.YELLOW, "Cache", f"Set<{full_path}>")
    return results
def page_not_found(error):
    """Answer a request by serving the cached or freshly parsed result.

    The request's full path (without a dangling ``?``) is resolved via
    ``api.get_cache``, falling back to ``api.parse``. A result is offered
    to ``api.set_cache`` and returned as JSON.

    Returns:
        The JSON response, ``('Not Found', 404)`` when there is no
        result, or ``(message, 500)`` when an exception is raised.
    """
    start_time = time()
    path = request.full_path
    # Strip a dangling '?' from the end of the path.
    if path.endswith('?'):
        path = path[:-1]
    try:
        result = api.get_cache(path) or api.parse(path)
        if result is None:
            logger.error('Received', '%s 404' % request.url)
            return 'Not Found', 404
        api.set_cache(path, result)
        res = jsonify(result)
        api.update_status('_status_received')
        end_time = time()
        time_usage = end_time - start_time
        logger.info(
            Fore.GREEN, 'Received', '%s %s 200 %.2fms' %
            (request.url, len(res.response), time_usage * 1000))
        return res
    except Exception as e:
        # A bare string would be sent with status 200 and look like a
        # success; report the failure as a server error.
        return str(e), 500
def serve(self, ip='0.0.0.0', port='5000', debug=None, **options):
    """Todo: Serve as an api server powered by flask

    Builds a Flask app whose 404 handler answers every unmatched URL
    with ``self.parse`` of the request path (plus query string), then
    runs it.

    Args:
        ip: Address passed to ``app.run``.
        port: Port passed to ``app.run``.
        debug: Accepted but not forwarded; the app is always run with
            ``debug=False``.
        **options: Extra keyword arguments forwarded to ``app.run``.
    """
    from flask import Flask, jsonify, request
    app = Flask(__name__)
    app.logger.setLevel(logging.ERROR)

    @app.errorhandler(404)
    def page_not_found(error):
        parse_result = urlparse(request.url)
        if parse_result.query != '':
            url = '{}?{}'.format(parse_result.path, parse_result.query)
        else:
            url = request.path
        try:
            res = jsonify(self.parse(url))
            logger.info(Fore.GREEN, 'Received',
                        '%s %s' % (request.url, len(res.response[0])))
            return res
        except Exception as e:
            # A bare string would be sent with status 200 and look like
            # a success; report the failure as a server error.
            return str(e), 500

    logger.info(Fore.WHITE, 'Serving', 'http://%s:%s' % (ip, port))
    app.run(ip, port, debug=False, **options)
def new(dir_or_project):
    """Create a new Toapi project.

    Giving a dir means start a default template,
    Example: toapi new api

    Giving a github project means start a github template.
    Example: toapi new toapi/toapi-one
    """
    # Imported locally so the block does not depend on module-level imports.
    import shutil
    import subprocess

    if '/' in dir_or_project:
        # "<owner>/<repo>": clone that GitHub project into "<repo>".
        dir_name = dir_or_project.split('/')[-1]
        repo_url = 'https://github.com/%s' % dir_or_project
    else:
        dir_name = dir_or_project
        repo_url = 'https://github.com/toapi/toapi-template'
        if os.path.exists(dir_name):
            logger.error('New project', 'Directory already exists.')
            return

    logger.info(Fore.GREEN, 'New project',
                'Creating project directory "%s"' % dir_name)
    # Run git with an argument list instead of a shell string, so the
    # name is never interpreted by a shell (spaces, quotes, ';').
    # '--' stops a name starting with '-' from being read as an option.
    try:
        status = subprocess.call(['git', 'clone', '--', repo_url, dir_name])
    except OSError as e:
        logger.error('New project', 'Could not run git: %s' % e)
        return
    if status != 0:
        # Stop here: removing "<dir>/.git" after a failed clone could
        # delete the git data of a directory that already existed.
        logger.error('New project',
                     'git clone failed with exit status %s' % status)
        return

    # Detach the new project from the template's history; best effort,
    # like the ``rm -rf`` it replaces, but portable.
    shutil.rmtree(os.path.join(dir_name, '.git'), ignore_errors=True)
    logger.info(Fore.GREEN, 'New project', 'Success!')
    click.echo('')
    click.echo(' cd %s' % dir_name)
    click.echo(' toapi run')
    click.echo('')
def fetch(self, url: str) -> str:
    """Return the HTML for ``url``, reading through ``self._storage``.

    A stored copy is returned directly. Otherwise the page is downloaded
    with ``requests``, decoded with the detected charset (UTF-8 when none
    is reported), saved in ``self._storage`` and returned.
    """
    stored = self._storage.get(url)
    if stored is not None:
        logger.info(Fore.BLUE, 'Storage', f'Get<{url}>')
        return stored

    response = requests.get(url)
    body = response.content
    encoding = cchardet.detect(body)['encoding'] or 'utf-8'
    html = body.decode(encoding)
    logger.info(Fore.GREEN, 'Sent', f'{url} {len(html)} {response.status_code}')

    self._storage[url] = html
    logger.info(Fore.BLUE, 'Storage', f'Set<{url}>')
    return html
def fetch(self, url: str) -> str:
    """Return the HTML for ``url``, reading through ``self._storage``.

    A stored copy is returned directly. Otherwise the page comes from
    ``self.browser`` when one is set, or from ``requests`` decoded with
    the detected charset (UTF-8 when none is reported). The result is
    saved in ``self._storage`` and returned.
    """
    stored = self._storage.get(url)
    if stored is not None:
        logger.info(Fore.BLUE, "Storage", f"Get<{url}>")
        return stored

    if self.browser is None:
        body = requests.get(url).content
        encoding = cchardet.detect(body)["encoding"] or "utf-8"
        html = body.decode(encoding)
    else:
        html = self.browser.get(url)
    logger.info(Fore.GREEN, "Sent", f"{url} {len(html)}")

    self._storage[url] = html
    logger.info(Fore.BLUE, "Storage", f"Set<{url}>")
    return html
def stop(self, signal, frame):
    """Log that the server stopped and terminate the process.

    Args:
        signal: Unused. Presumably the signal number, if this is
            installed as a signal handler; confirm at the call site.
        frame: Unused.

    Raises:
        SystemExit: Always.
    """
    logger.info(Fore.WHITE, 'Server', 'Server Stopped')
    # ``exit()`` is an interactive helper installed by the ``site``
    # module; raise the exception it wraps instead.
    raise SystemExit()
def serve(self, ip='127.0.0.1', port=5000, **options):
    """Log the serving address and run ``self.app`` with debug disabled.

    A ``KeyboardInterrupt`` (Ctrl-C) ends the process through
    ``sys.exit()`` instead of propagating.
    """
    try:
        address = 'http://%s:%s' % (ip, port)
        logger.info(Fore.WHITE, 'Serving', address)
        self.app.run(ip, port, debug=False, **options)
    except KeyboardInterrupt:
        sys.exit()