def _filter_links(self, links, text=None, text_regex=None, url=None, url_regex=None, predicate=None):
    predicates = []
    if text is not None:
        predicates.append(lambda link: link.string == text)
    if text_regex is not None:
        predicates.append(lambda link: re_compile(text_regex).search(link.string or ''))
    if url is not None:
        predicates.append(lambda link: link.get('href') == url)
    if url_regex is not None:
        predicates.append(lambda link: re_compile(url_regex).search(link.get('href', '')))
    if predicate:
        # append the caller-supplied predicate to the list (was "predicate.append(predicate)")
        predicates.append(predicate)

    def f(link):
        for p in predicates:
            if not p(link):
                return False
        return True

    return [link for link in links if f(link)]

def _match(self, mapping, value):
    for pat, what in utils.group(mapping, 2):
        rx = utils.re_compile('^' + pat + '$')
        result = rx.match(value)
        if result:
            return what, [x and urllib.unquote(x) for x in result.groups()]
    return None, None

def _match(self, mapping, value):
    for pat, what in utils.group(mapping, 2):
        rx = utils.re_compile("^" + pat + "$")
        result = rx.match(value)
        if result:
            return what, [x and urllib.unquote(x) for x in result.groups()]
    return None, None

def _match(self, mapping, value):
    for pat, what in utils.group(mapping, 2):
        if isinstance(what, basestring):
            what, result = utils.re_subm("^" + pat + "$", what, web.ctx.path)
        else:
            result = utils.re_compile("^" + pat + "$").match(web.ctx.path)
        if result:  # it's a match
            return what, [x and urllib.unquote(x) for x in result.groups()]
    return None, None

def _match(self, mapping, value):
    for pat, what in mapping:
        if isinstance(what, basestring):
            what, result = utils.re_subm('^' + pat + '$', what, value)
        else:
            result = utils.re_compile('^' + pat + '$').match(value)
        if result:  # it's a match
            return what, [x for x in set(result.groups()).difference(set(result.groupdict().values()))], result.groupdict()  # microhuang
    return None, None, None

def _match(self, mapping, value):
    for pat, what in mapping:
        if isinstance(what, basestring):
            what, result = utils.re_subm('^' + pat + '$', what, value)
        else:
            result = utils.re_compile('^' + pat + '$').match(value)
        if result:  # it's a match
            return what, [x for x in result.groups()]
    return None, None

def _filter_links(self, links, text=None, text_regex=None, url=None, url_regex=None, predicate=None):
    predicates = []
    if text is not None:
        predicates.append(lambda link: link.string == text)
    if text_regex is not None:
        predicates.append(lambda link: re_compile(text_regex).search(link.string or ""))
    if url is not None:
        predicates.append(lambda link: link.get("href") == url)
    if url_regex is not None:
        predicates.append(lambda link: re_compile(url_regex).search(link.get("href", "")))
    if predicate:
        # append the caller-supplied predicate to the list (was "predicate.append(predicate)")
        predicates.append(predicate)

    def f(link):
        for p in predicates:
            if not p(link):
                return False
        return True

    return [link for link in links if f(link)]

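# Hedged usage sketch for the link filters above (not from the original source).
# It assumes the links behave like BeautifulSoup <a> tags, so a tiny stand-in
# class is used here, and the same filtering logic is restated as a free
# function purely so the demo can run on its own.
from re import compile as re_compile


class FakeLink(object):
    """Minimal stand-in for a parsed <a> tag (illustrative only)."""

    def __init__(self, string, href):
        self.string = string
        self._attrs = {"href": href}

    def get(self, key, default=None):
        return self._attrs.get(key, default)


def filter_links(links, text=None, text_regex=None, url=None, url_regex=None, predicate=None):
    # Same idea as _filter_links above, minus the browser object.
    predicates = []
    if text is not None:
        predicates.append(lambda link: link.string == text)
    if text_regex is not None:
        predicates.append(lambda link: re_compile(text_regex).search(link.string or ""))
    if url is not None:
        predicates.append(lambda link: link.get("href") == url)
    if url_regex is not None:
        predicates.append(lambda link: re_compile(url_regex).search(link.get("href", "")))
    if predicate:
        predicates.append(predicate)
    return [link for link in links if all(p(link) for p in predicates)]


demo_links = [FakeLink("Home", "/"), FakeLink("Docs", "/docs/"), FakeLink("About", "/about/")]
print([l.string for l in filter_links(demo_links, url_regex=r"^/docs")])  # -> ['Docs']
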
def validates_email(self, key, email):
    msg = 'Invalid email address'
    if '@' not in email:
        raise ValidationError(msg)

    user, domain = email.split('@')
    user_regex = re_compile(
        r"(^[-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*\Z"
        r'|^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\001-\011\013\014\016-\177])*"\Z)',
        re.IGNORECASE)
    if not user_regex.match(user):
        raise ValidationError(msg)

    domain_regex = re_compile(
        r'((?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+)(?:[A-Z0-9-]{2,63}(?<!-))\Z',
        re.IGNORECASE)
    if not domain_regex.match(domain):
        raise ValidationError(msg)

    return email

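# Hedged standalone sketch (not part of the original source): the same user and
# domain regexes as validates_email above, wrapped in a plain function so they
# can be tried outside the model class. ValidationError and re_compile from the
# snippet are replaced by stdlib equivalents for the demo.
import re

_user_regex = re.compile(
    r"(^[-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*\Z"
    r'|^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\001-\011\013\014\016-\177])*"\Z)',
    re.IGNORECASE)
_domain_regex = re.compile(
    r'((?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+)(?:[A-Z0-9-]{2,63}(?<!-))\Z',
    re.IGNORECASE)


def looks_like_email(email):
    """Return True when both halves of the address pass the regex checks."""
    if email.count('@') != 1:
        return False
    user, domain = email.split('@')
    return bool(_user_regex.match(user)) and bool(_domain_regex.match(domain))


print(looks_like_email("alice@example.com"))  # True
print(looks_like_email("not-an-email"))       # False
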
def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile('^', re.M)

    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [
            f for f in filenames
            if not f.startswith('.') and not f.endswith('~')
            and not f.startswith('__init__.py')
        ]

        for d in dirnames[:]:
            if d.startswith('.'):
                dirnames.remove(d)  # don't visit this dir

        out = open(os.path.join(dirpath, '__init__.py'), 'w')
        out.write('from web.template import CompiledTemplate, ForLoop\n\n')
        if dirnames:
            out.write("import " + ", ".join(dirnames))

        for f in filenames:
            path = os.path.join(dirpath, f)

            if '.' in f:
                name, _ = f.split('.', 1)
            else:
                name = f

            text = open(path).read()
            text = Template.normalize_text(text)
            code = Template.generate_code(text, path)
            code = re_start.sub('    ', code)

            _gen = '' + \
                '\ndef %s():' + \
                '\n    loop = ForLoop()' + \
                '\n    _dummy = CompiledTemplate(lambda: None, "dummy")' + \
                '\n    join_ = _dummy._join' + \
                '\n    escape_ = _dummy._escape' + \
                '\n' + \
                '\n%s' + \
                '\n    return __template__'

            gen_code = _gen % (name, code)
            out.write(gen_code)
            out.write('\n\n')
            out.write('%s = CompiledTemplate(%s(), %s)\n\n' % (name, name, repr(path)))

            # create template to make sure it compiles
            t = Template(open(path).read(), path)
        out.close()

def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile('^', re.M)

    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [
            f for f in filenames
            if not f.startswith('.') and not f.endswith('~')
            and not f.startswith('__init__.py')
        ]

        for d in dirnames[:]:
            if d.startswith('.'):
                dirnames.remove(d)  # don't visit this dir

        out = open(os.path.join(dirpath, '__init__.py'), 'w')
        out.write('from web.template import CompiledTemplate, ForLoop, TemplateResult\n\n')
        if dirnames:
            out.write("import " + ", ".join(dirnames))
        out.write("_dummy = CompiledTemplate(lambda: None, 'dummy')\n")
        out.write("join_ = _dummy._join\n")
        out.write("escape_ = _dummy._escape\n")
        out.write("\n")

        for f in filenames:
            path = os.path.join(dirpath, f)

            if '.' in f:
                name, _ = f.split('.', 1)
            else:
                name = f

            text = open(path).read()
            text = Template.normalize_text(text)
            code = Template.generate_code(text, path)
            code = code.replace("__template__", name, 1)

            out.write(code)
            out.write('\n\n')
            out.write('%s = CompiledTemplate(%s, %s)\n\n' % (name, name, repr(path)))

            # create template to make sure it compiles
            t = Template(open(path).read(), path)
        out.close()

def _match(self, mapping, value):
    for pat, what in mapping:
        if isinstance(what, application):
            if value.startswith(pat):
                f = lambda: self._delegate_sub_application(pat, what)
                return f, None
            else:
                continue
        elif isinstance(what, basestring):
            what, result = utils.re_subm('^' + pat + '$', what, value)
        else:
            result = utils.re_compile('^' + pat + '$').match(value)

        if result:  # it's a match
            return what, [x for x in result.groups()]
    return None, None

def _match(self, mapping, value):
    for pat, what in utils.group(mapping, 2):
        if isinstance(what, application):
            if value.startswith(pat):
                f = lambda: self._delegate_sub_application(pat, what)
                return f, None
            else:
                continue
        elif isinstance(what, basestring):
            what, result = utils.re_subm("^" + pat + "$", what, value)
        else:
            result = utils.re_compile("^" + pat + "$").match(value)

        if result:  # it's a match
            return what, [x and urllib.unquote(x) for x in result.groups()]
    return None, None

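# Hedged illustration (assumed, not taken from the original source) of the core
# pattern the _match variants above share: walk an ordered (regex, handler)
# mapping, anchor each pattern with ^...$, and return the handler plus the
# captured groups for the first pattern that matches the request path. The
# mapping and handler names below are made up for the demo.
import re


def match(mapping, value):
    for pat, what in mapping:
        result = re.compile('^' + pat + '$').match(value)
        if result:  # it's a match
            return what, list(result.groups())
    return None, None


demo_mapping = [
    (r'/users/(\d+)', 'user_detail'),
    (r'/posts/(\d+)/comments/(\d+)', 'comment_detail'),
]

print(match(demo_mapping, '/users/42'))            # ('user_detail', ['42'])
print(match(demo_mapping, '/posts/7/comments/3'))  # ('comment_detail', ['7', '3'])
print(match(demo_mapping, '/missing'))             # (None, None)
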
def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile('^', re.M)

    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [
            f for f in filenames
            if not f.startswith('.') and not f.endswith('~')
            and not f.startswith('__init__.py')
        ]

        for d in dirnames[:]:
            if d.startswith('.'):
                dirnames.remove(d)  # don't visit this dir

        out = open(os.path.join(dirpath, '__init__.py'), 'w')
        out.write('from web.template import CompiledTemplate, ForLoop\n\n')
        if dirnames:
            out.write("import " + ", ".join(dirnames))

        for f in filenames:
            path = os.path.join(dirpath, f)

            if '.' in f:
                name, _ = f.split('.', 1)
            else:
                name = f

            text = open(path).read()
            text = Template.normalize_text(text)
            code = Template.generate_code(text, path)
            code = re_start.sub('    ', code)

            _gen = '' + \
                '\ndef %s():' + \
                '\n    loop = ForLoop()' + \
                '\n    _dummy = CompiledTemplate(lambda: None, "dummy")' + \
                '\n    join_ = _dummy._join' + \
                '\n    escape_ = _dummy._escape' + \
                '\n' + \
                '\n%s' + \
                '\n    return __template__'

            gen_code = _gen % (name, code)
            out.write(gen_code)
            out.write('\n\n')
            out.write('%s = CompiledTemplate(%s, %s)\n\n' % (name, name, repr(path)))

            # create template to make sure it compiles
            t = Template(open(path).read(), path)
        out.close()

def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile("^", re.M)

    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [
            f for f in filenames
            if not f.startswith(".") and not f.endswith("~")
            and not f.startswith("__init__.py")
        ]

        out = open(os.path.join(dirpath, "__init__.py"), "w")
        out.write("from web.template import CompiledTemplate, ForLoop\n\n")
        if dirnames:
            out.write("import " + ", ".join(dirnames))

        for f in filenames:
            path = os.path.join(dirpath, f)

            # create template to make sure it compiles
            t = Template(open(path).read(), path)

            if "." in f:
                name, _ = f.split(".", 1)
            else:
                name = f

            code = Template.generate_code(open(path).read(), path)
            code = re_start.sub("    ", code)

            _gen = (
                ""
                + "\ndef %s():"
                + "\n    loop = ForLoop()"
                + '\n    _dummy = CompiledTemplate(lambda: None, "dummy")'
                + "\n    join_ = _dummy._join"
                + "\n    escape_ = _dummy._escape"
                + "\n"
                + "\n%s"
                + "\n    return __template__"
            )
            gen_code = _gen % (name, code)
            out.write(gen_code)
            out.write("\n\n")
            out.write("%s = CompiledTemplate(%s(), %s)\n\n" % (name, name, repr(path)))
        out.close()

def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile("^", re.M)

    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [
            f for f in filenames
            if not f.startswith(".") and not f.endswith("~")
            and not f.startswith("__init__.py")
        ]

        for d in dirnames[:]:
            if d.startswith("."):
                dirnames.remove(d)  # don't visit this dir

        out = open(os.path.join(dirpath, "__init__.py"), "w")
        out.write("from web.template import CompiledTemplate, ForLoop, TemplateResult\n\n")
        if dirnames:
            out.write("import " + ", ".join(dirnames))
        out.write("_dummy = CompiledTemplate(lambda: None, 'dummy')\n")
        out.write("join_ = _dummy._join\n")
        out.write("escape_ = _dummy._escape\n")
        out.write("\n")

        for f in filenames:
            path = os.path.join(dirpath, f)

            if "." in f:
                name, _ = f.split(".", 1)
            else:
                name = f

            text = open(path).read()
            text = Template.normalize_text(text)
            code = Template.generate_code(text, path)
            code = code.replace("__template__", name, 1)

            out.write(code)
            out.write("\n\n")
            out.write("%s = CompiledTemplate(%s, %s)\n\n" % (name, name, repr(path)))

            # create template to make sure it compiles
            t = Template(open(path).read(), path)
        out.close()

def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile('^', re.M)

    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [
            f for f in filenames
            if not f.startswith('.') and not f.endswith('~')
            and not f.startswith('__init__.py')
        ]

        for d in dirnames[:]:
            if d.startswith('.'):
                dirnames.remove(d)  # don't visit this dir

        out = open(os.path.join(dirpath, '__init__.py'), 'w')
        out.write('from web.template import CompiledTemplate, ForLoop, TemplateResult\n\n')
        if dirnames:
            out.write("import " + ", ".join(dirnames))
        out.write("\n")

        for f in filenames:
            path = os.path.join(dirpath, f)

            if '.' in f:
                name, _ = f.split('.', 1)
            else:
                name = f

            text = open(path).read()
            text = Template.normalize_text(text)
            code = Template.generate_code(text, path)
            code = code.replace("__template__", name, 1)

            # inject "join_ = ..; escape_ = .." into the code.
            # That is required to make escape functionality work correctly.
            code = code.replace("\n", "\n    join_ = %s._join; escape_ = %s._escape\n" % (name, name), 1)

            out.write(code)
            out.write('\n\n')
            out.write('%s = CompiledTemplate(%s, %s)\n\n' % (name, name, repr(path)))

            # create template to make sure it compiles
            t = Template(open(path).read(), path)
        out.close()

def find_indent(text):
    rx = re_compile(' +')
    match = rx.match(text)
    first_indent = match and match.group(0)
    return first_indent or ""

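# Assumed usage example for find_indent above (not from the original source).
# web.utils.re_compile is treated here as a cached alias of re.compile, so the
# plain standard-library version is imported to make the calls runnable.
from re import compile as re_compile

print(repr(find_indent("    if x:\n        pass")))  # -> '    '
print(repr(find_indent("no indent here")))           # -> ''
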
__all__ = ["render"] import re, urlparse, pprint, traceback, sys from Cheetah.Compiler import Compiler from Cheetah.Filters import Filter from utils import re_compile, memoize, dictadd from net import htmlquote, websafe from webapi import ctx, header, output, input, cookies, loadhooks def upvars(level=2): """Guido van Rossum sez: don't use this function.""" return dictadd( sys._getframe(level).f_globals, sys._getframe(level).f_locals) r_include = re_compile(r'(?!\\)#include \"(.*?)\"($|#)', re.M) def __compiletemplate(template, base=None, isString=False): if isString: text = template else: text = open('templates/'+template).read() # implement #include at compile-time def do_include(match): text = open('templates/'+match.groups()[0]).read() return text while r_include.findall(text): text = r_include.sub(do_include, text) execspace = _compiletemplate.bases.copy() tmpl_compiler = Compiler(source=text, mainClassName='GenTemplate') tmpl_compiler.addImportedVarNames(execspace.keys())
def _vaild_session_id(self, session_id):
    rx = utils.re_compile('^[0-9a-fA-F]+$')
    if rx.match(session_id):
        return True
    return False

def ssdut_news_list(page_raw):
    ''' parse the news_list page and return a list of news items,
        in the same sequence as the page.

        result.soup
              .page_no
              .news_list
              .total_records
    '''
    result = Storage()
    soup = bsoup(page_raw)
    result.soup = soup

    # get current page number
    r = soup.find(text=ur"\u4e0b\u4e00\u9875")  # text = u"下一页" ("next page")
    if r:
        '''not the last page'''
        next_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(next_page_link).group(1)
        page_no = int(page_no)  # - 1
    else:
        '''the last page'''
        r = soup.find(text=ur'\u4e0a\u4e00\u9875')  # u"上一页" ("previous page")
        prev_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(prev_page_link).group(1)
        page_no = int(page_no)  # + 1
    result.page_no = page_no

    # get the news list
    res = soup.findAll(attrs={"bgcolor": "#EEEEEE"})
    news_list = []
    counter = 1
    for r in res:
        a = r.findChildren("a")
        date_str = r.find(text=re_compile("\d{4}-\d{2}-\d{2}")).encode("utf-8")
        news_list.append({
            "link": a[0].get("href").encode("utf-8"),
            "title": a[0].text.encode("utf-8"),
            "source": a[1].text.encode("utf-8"),
            "source_link": a[1].get("href").encode("utf-8"),
            "date_str": date_str,
            "date": datetime.date(*[int(n) for n in date_str.split("-")]),
            "no": counter,
        })
        counter += 1
        #logging.debug("source = %s, source_link = %s" %
        #              (news_list[-1]['source'], news_list[-1]['source_link']))
    result.news_list = news_list

    # total record count; the page footer reads u"共N 条记录" ("N records in total")
    s = soup.find(text=re_compile(ur"\u5171\d+ \u6761\u8bb0\u5f55"))
    r = re_compile(ur"\u5171(\d+)")
    result.total_records = int(r.search(s).group(1))

    return result

def _valid_session_id(self, session_id):
    rx = utils.re_compile('^[0-9a-fA-F]+$')
    return rx.match(session_id)

__all__ = ["render"] import re, urlparse, pprint, traceback, sys from Cheetah.Compiler import Compiler from Cheetah.Filters import Filter from utils import re_compile, memoize, dictadd from net import htmlquote, websafe from webapi import ctx, header, output, input, cookies def upvars(level=2): """Guido van Rossum sez: don't use this function.""" return dictadd( sys._getframe(level).f_globals, sys._getframe(level).f_locals) r_include = re_compile(r'(?!\\)#include \"(.*?)\"($|#)', re.M) def __compiletemplate(template, base=None, isString=False): if isString: text = template else: text = open('templates/'+template).read() # implement #include at compile-time def do_include(match): text = open('templates/'+match.groups()[0]).read() return text while r_include.findall(text): text = r_include.sub(do_include, text) execspace = _compiletemplate.bases.copy() tmpl_compiler = Compiler(source=text, mainClassName='GenTemplate') tmpl_compiler.addImportedVarNames(execspace.keys())
def ssdut_news_parse(raw):
    ''' parse the raw page src and store all results in a Storage object.
        all strings are unicode

        result.soup        BeautifulSoup object
        result.raw         raw page src
        result.hash        sha1 hash of the page
        result.title       title
        result.source      source (来源)
        result.date_str    date as string
        result.date        date object
        result.body        html src of the news body
        result.clean_body  unescaped src of the news body
        result.publisher   publisher (发表人)
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / hash
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(
        ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body) unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (can be found at the bottom of the page)
    s = soup.find(
        attrs={
            "style": "font-size:14px;float:left;text-align:right;width:80%"
        })
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")  # u"发表人：" ("publisher:")
    #logging.debug("publisher string = %r " % s)
    try:
        name = r.findall(s.text)[0]
    except:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher, e.g. index.php/News/8692.html
    result.publisher = name.strip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')

    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()

    result.search_text = ''.join([result.title, result.source,
                                  result.clean_body, result.publisher,
                                  result.sha1])
    return result

def _vaild_session_id(self, session_id):
    rx = utils.re_compile('^[0-9a-fA-F]+$')
    if rx.match(session_id):
        return True
    return False

def ssdut_news_parse(raw):
    ''' parse the raw page src and store all results in a Storage object.
        all strings are unicode

        result.soup        BeautifulSoup object
        result.raw         raw page src
        result.hash        sha1 hash of the page
        result.title       title
        result.source      source (来源)
        result.date_str    date as string
        result.date        date object
        result.body        html src of the news body
        result.clean_body  unescaped src of the news body
        result.publisher   publisher (发表人)
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / hash
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body) unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (can be found at the bottom of the page)
    s = soup.find(
        attrs={
            "style": "font-size:14px;float:left;text-align:right;width:80%"
        })
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")  # u"发表人：" ("publisher:")
    #logging.debug("publisher string = %r " % s)
    try:
        name = r.findall(s.text)[0]
    except:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher, e.g. index.php/News/8692.html
    result.publisher = name.strip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')

    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()

    result.search_text = ''.join([
        result.title, result.source, result.clean_body, result.publisher,
        result.sha1
    ])
    return result

def ssdut_news_list(page_raw):
    ''' parse the news_list page and return a list of news items,
        in the same sequence as the page.

        result.soup
              .page_no
              .news_list
              .total_records
    '''
    result = Storage()
    soup = bsoup(page_raw)
    result.soup = soup

    # get current page number
    r = soup.find(text=ur"\u4e0b\u4e00\u9875")  # text = u"下一页" ("next page")
    if r:
        '''not the last page'''
        next_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(next_page_link).group(1)
        page_no = int(page_no)  # - 1
    else:
        '''the last page'''
        r = soup.find(text=ur'\u4e0a\u4e00\u9875')  # u"上一页" ("previous page")
        prev_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(prev_page_link).group(1)
        page_no = int(page_no)  # + 1
    result.page_no = page_no

    # get the news list
    res = soup.findAll(attrs={"bgcolor": "#EEEEEE"})
    news_list = []
    counter = 1
    for r in res:
        a = r.findChildren("a")
        date_str = r.find(text=re_compile("\d{4}-\d{2}-\d{2}")).encode("utf-8")
        news_list.append(
            {
                "link": a[0].get("href").encode("utf-8"),
                "title": a[0].text.encode("utf-8"),
                "source": a[1].text.encode("utf-8"),
                "source_link": a[1].get("href").encode("utf-8"),
                "date_str": date_str,
                "date": datetime.date(*[int(n) for n in date_str.split("-")]),
                "no": counter,
            })
        counter += 1
        #logging.debug("source = %s, source_link = %s" %
        #              (news_list[-1]['source'], news_list[-1]['source_link']))
    result.news_list = news_list

    # total record count; the page footer reads u"共N 条记录" ("N records in total")
    s = soup.find(text=re_compile(ur"\u5171\d+ \u6761\u8bb0\u5f55"))
    r = re_compile(ur"\u5171(\d+)")
    result.total_records = int(r.search(s).group(1))

    return result