import math
import os.path
import pickle
import re
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import as_completed
from pprint import pprint

from lxml.html.clean import Cleaner
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

not_word_chars = re.compile(r'\s+|[^\w]|\s+[^\w]*|[^A-Za-z]')
stemmer = SnowballStemmer('english')
cleaner = Cleaner(scripts=True, javascript=True, comments=True, style=True,
                  embedded=True, forms=True, annoying_tags=True)
num_docs = 0


class Document(object):
    def __init__(self, path, tokens, counter):
        self.path = path
        self.tokens = tokens
        self.counter = counter

    @property
    def doc_length(self):
        # Euclidean (L2) length of the term-count vector
        return math.sqrt(sum(v ** 2 for v in self.counter.values()))
import re
import subprocess
import sys
import xml.etree.ElementTree as ET
from subprocess import call

import lxml
import seleniumclient
from lxml.html.clean import Cleaner
from nxs_utils import ThreadPool, Timer

WORKERS = 1
siteBase = "https://bed-search.nextprot.org/"
sitemapUrl = siteBase + "sitemap.xml"

# Where to save the static site
dirlocation = "/work/tmp/static-site/"

cleaner = Cleaner()
cleaner.scripts = True  # True activates the javascript filter


def saveToFile(content, filename):
    text_file = open(filename, "w")
    text_file.write(content.encode('UTF-8'))
    text_file.close()
    print str(incrementCounter()) + " creating file " + filename + " "
    sys.stdout.flush()


def createDirectoryStructureIfNeeded(URLS):
    for url in URLS:
        filename = getFilename(url)
def scrape_links(links):
    # cleaner to remove every tag
    maincleaner = Cleaner(allow_tags=['div'], remove_unknown_tags=False, remove_tags=['div'])
    # while True:
    for link in links:  # loop through all the links
        if link == last_link:  # check if this link has already been scraped (this will eventually be changed to check dates)
            break  # if we've hit something we've already scraped, break out of the loop
        # try:
        # scrape the contents of the current link and decode from Windows-1252 encoding
        linkhtml = scraperwiki.scrape(link).decode('Windows-1252')
        print link
        root = lxml.html.fromstring(linkhtml)  # turn scraped content into an HTML object

        # GET TITLE
        title = root.cssselect("h1")[0].text.encode('utf-8')  # grab the page header (title) and return its text as unicode
        title = replace_all(title, subDic)  # replace alphanumeric obfuscations with letters

        # GET DATE
        date = root.cssselect("div.adInfo")[0].text  # get the text of the html entity that contains the date and time of the post
        cleandate = re.sub(r'(\S+\s+\d+,\s+\d\d\d\d)(?:,?) (\d+\:\d+ \w\w)', r'\1 \2',
                           date.strip())  # get date into a standard format
        cleandate = re.search(r'\S+\s+\d+, \d\d\d\d \d+\:\d+ \w\w', cleandate).group(0)  # find the date string on the page
        rawdate = datetime.strptime(cleandate, '%B %d, %Y %I:%M %p')  # parse the date using format "Month dd, YYYY hh:mm am/pm"
        date = rawdate.strftime('%Y-%m-%d %H:%M')  # format that date back into a string of format YYYY-mm-dd

        # GET MAIN BODY TEXT
        mainwithtags = root.cssselect("div.postingBody")[0]  # grabs the body text of the post
        main = maincleaner.clean_html(mainwithtags).text.encode('utf-8')  # gets rid of all HTML tags
        main = replace_all(main, subDic)  # replace alphanumeric obfuscations with letters

        # GET PHONE NUMBER(S)
        stripped = replace_all(main.lower(), wordDic)  # replaces common phone number obfuscations with actual numbers
        phonecomp = re.compile("[\s\-/=\.,{}_\!\@\#\$\%\^\&\*\(\)\~]")  # list of known phone number dividers
        stripped = phonecomp.sub('', stripped)  # remove phone number dividers
        phone = re.findall(r'(?:1?)[1-9]\d{9}', stripped)  # search for groups of 10 consecutive numbers (with an optional preceding 1)
        phone = list(set(phone))  # gets rid of duplicate numbers by turning the list into a set and back
        phone = ", ".join(phone)  # formats phone numbers as "phone1, phone2,... phoneN"

        # GET LISTED AGE
        if root.cssselect("p.metaInfoDisplay"):  # does the entry have metainfo?
            listedage = root.cssselect("p.metaInfoDisplay")[0]  # get the first html metainfo element
            listedage = re.sub("[^\d]", "", listedage.text)  # get rid of all non-numeric text in the text of the element
        else:  # if there's no metainfo
            listedage = ""  # set the listed age to an empty string

        # GET LOCATION
        if re.findall(r'Location\:(.*?)\</div\>', linkhtml, flags=re.DOTALL):
            location = re.findall('Location\:(.*?)\</div\>', linkhtml, flags=re.DOTALL)[0].encode('utf-8')
            location = removeNonAscii(location)
            print repr(location)
        else:
            location = ""

        # GET PICTURES
        picturelist = []
        pictures = root.cssselect('ul#viewAdPhotoLayout img')
        for i in range(len(pictures)):
            largepic = re.sub('/medium/', '/large/', pictures[i].get('src'))
            picturelist.append(largepic)
        print picturelist
        picturelist = " ".join(picturelist)
        x = urllib.urlopen(largepic).read()
        piccode = base64.encodestring(x)
        print piccode
        # except:
        #     print 'FAILED TO LOAD: ' + link
        #     continue
        #     record = {}
        #     record['Title'] = 'LOAD FAILURE'

        # Set up our data record - we'll need it later
        record = {}
        record['Title'] = title  # .encode('ascii', 'ignore').strip()
        record['Date'] = date
        record['Main'] = main  # .encode('ascii', 'ignore').strip()
        record['Pictures'] = picturelist
        record['Phone'] = phone
        record['Listed Age'] = listedage  # .encode('ascii', 'ignore').strip()
        record['Location'] = location
        record['PicCode'] = piccode  # .encode('ascii', 'ignore').strip()

        # Print out the data we've gathered
        # print record, '------------'

        # Finally, save the record to the datastore - 'Title' is our unique key
        scraperwiki.sqlite.save(["Title"], record)
    body.rewrite_links(myRewriteLink)
    f = open(fileout, "wb")
    f.write(html.tostring(body))
    f.close()


if len(sys.argv) != 3:
    usage()
dirin = sys.argv[1]
dirout = sys.argv[2]

cleaner = Cleaner(style=True)

import zipfile

with zipfile.ZipFile(dirin, 'r') as myzip:
    for orig in myzip.namelist():
        print(orig)
        dest = os.path.join(dirout, orig)
        if orig.endswith('/'):
            os.makedirs(dest, exist_ok=True)
        elif orig.endswith(".html"):
            f = myzip.read(orig)
            convert(f, dest)
        else:
            myzip.extract(orig, dirout)
    using_sysrandom = True
except NotImplementedError:
    using_sysrandom = False

SECRET = random.randint(0, 1000000)
logger = logging.getLogger('castle.cms')
ANONYMOUS_USER = "******"

_truncate_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                            style=True, links=True, meta=True,
                            page_structure=True, embedded=True, frames=True,
                            forms=True, annoying_tags=True,
                            remove_tags=('div',), kill_tags=('img', 'hr'),
                            remove_unknown_tags=True)


def truncate_text(text, max_words=30, more_link=None, clean=False):
    """ adapted from Django """
    if not isinstance(text, basestring):
        return ''
import lxml.etree
from lxml.html.clean import Cleaner
from w3lib.html import strip_html5_whitespace

import html_text

from extruct.utils import parse_html

# Cleaner which is similar to the html_text cleaner, but is less aggressive
cleaner = Cleaner(
    scripts=True,
    javascript=False,  # onclick attributes are fine
    comments=True,
    style=True,
    links=True,
    meta=True,
    page_structure=False,  # <title> may be nice to have
    processing_instructions=True,
    embedded=False,  # keep embedded content
    frames=False,  # keep frames
    forms=False,  # keep forms
    annoying_tags=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
)


class LxmlMicrodataExtractor(object):
    _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
    _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
                                                  .//*[@itemscope]//*[@itemprop])""",
                                namespaces={"set": "http://exslt.org/sets"})
    _xp_clean_text = lxml.etree.XPath(
    get_class_weight,
    get_link_density,
    is_unlikely_node,
    score_candidates,
)
from .utils import cached_property, shrink_text

html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                       style=True, links=True, meta=False, add_nofollow=False,
                       page_structure=False, processing_instructions=True,
                       embedded=False, frames=False, forms=False,
                       annoying_tags=False, remove_tags=None,
                       kill_tags=("noscript", "iframe"),
                       remove_unknown_tags=False, safe_attrs_only=False)

SCORABLE_TAGS = ("div", "p", "td", "pre", "article")

ANNOTATION_TAGS = (
    "a",
    "abbr",
    "acronym",
    "b",
        strings = [strings]
    strings = [_collect_string_content(x) for x in strings]
    res = []
    for s in strings:
        try:
            res.append(s.replace(old, new))
        except:
            pass
    return res


_replace_line = re.compile("(<br>)|(</p>)|(</li>|</div>)", re.I | re.S)
_replace_blank = re.compile("""(?:<.*?>)|\r|\t""", re.I | re.S)
_remove_multi_line = re.compile('\n+', re.I | re.S)
_special_char = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
_html_cleaner = Cleaner(style=True)


def _unescape(s):
    if '&' not in s:
        return s

    def replaceEntities(s):
        s = s.groups()[0]
        special_char_dict = {
            "amp": "&",
            "lt": "<",
            "gt": ">",
            "nbsp": " ",
        }
        try:
def html_clean(html):
    """ Clean up HTML to be safe """
    cleaner = Cleaner(safe_attrs_only=True)
    return cleaner.clean_html(html)
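# A minimal usage sketch, not part of the original snippet, assuming the
# module imports Cleaner from lxml.html.clean: with safe_attrs_only=True the
# cleaner keeps only lxml's default attribute whitelist, so event handlers and
# style attributes are dropped while whitelisted ones such as title survive.
dirty = '<p onclick="alert(1)" style="color:red" title="ok">hi</p>'
print(html_clean(dirty))
# expected roughly: <p title="ok">hi</p>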
SearchEngine = namedtuple('Engine', ['name', 'api', 'charset'])
search_engine = [
    SearchEngine(name='BAIDU', api='http://www.baidu.com/s?wd={}', charset='utf-8'),
    # SearchEngine(name='SOGOU', api='http://www.sogou.com/web?query={}', charset='utf-8'),
    SearchEngine(name='BDZD', api='https://zhidao.baidu.com/search?word={}', charset='gbk'),
    # SearchEngine(name='BING', api='http://cn.bing.com/search?q={}', charset='utf-8'),
    # SearchEngine(name='GOOGLE', api='https://www.google.com.hk/search?newwindow=1&hl=zh-CN&q={}', charset='utf-8'),
]

category_list = ['summary', 'option']
# category_list = ['summary']
old_question = []

cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)

headers = {
    'Host': 'msg.api.chongdingdahui.com',
    'User-Agent': 'LiveTrivia/1.0.4 (com.chongdingdahui.app; build:0.1.7; iOS 11.2.2) Alamofire/4.6.0',
    'X-Live-App-Version': '1.0.4',
    'Content-Type': 'application/json',
    'X-Live-Device-Identifier': 'AC654DF3-402D-40B3-BF20-19D8A5B57793',
    # 'X-Live-Session-Token': '1.3071218.845633.ZtC.d650c387f4c187ce54b2ea432bfa4f51',
    'X-Live-Session-Token': '1.3071218.2811521.lao.5dc60efb955f70eb1e981ceb422e0eae',
    'X-Live-Device-Type': 'ios',
    'X-Live-OS-Version': 'Version 11.2.2 (Build 15C202)',
}
search_header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
def clean_html(text):
    from lxml.html.clean import Cleaner
    return Cleaner(links=False, style=True).clean_html(text)
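# A small sketch, not from the source: with style=True both <style> blocks and
# inline style attributes are stripped, while links=False leaves <link>
# elements (e.g. stylesheet references) alone.
print(clean_html('<div><style>p {}</style><p style="color:red">text</p></div>'))
# expected roughly: <div><p>text</p></div>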
import lxml.html
from lxml.html.clean import Cleaner

# Name of the downloaded XHTML file.
# Incidentally, 789_14547.html is "I Am a Cat" (吾輩は猫である).
FILE_NAME = 'data/xhtml/789_14547.html'

with open(FILE_NAME, encoding='shift_jis') as f:
    data = f.read().encode('shift_jis')

cleaner = Cleaner(page_structure=False, remove_tags=('ruby', 'br'),
                  kill_tags=('rt', 'rp'))
cln_html = cleaner.clean_html(data).decode('utf-8')
plain_text = lxml.html.fromstring(cln_html).find_class('main_text')[0].text_content()
# print(plain_text)

# Save to a separate file
PLAIN_TEXT = FILE_NAME.replace('xhtml', 'text').replace('.html', '.txt')
print(PLAIN_TEXT)
with open(PLAIN_TEXT, 'w') as f:
    f.write(plain_text)
def cmd_clean(root, **kwargs):
    return Cleaner(**kwargs).clean_html(root)
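# A hypothetical invocation, not from the source: the **kwargs map straight
# onto Cleaner's constructor flags, so callers can toggle any documented
# option per call.
import lxml.html

root = lxml.html.fromstring('<div><script>x()</script><p>kept</p></div>')
cleaned = cmd_clean(root, scripts=True, style=True)
print(lxml.html.tostring(cleaned, encoding='unicode'))  # the <script> element is gone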
# -----------------------------------------------------------------------------
# 1. Parse the raw HTML to get the interesting bits - the part inside <td> tags.
# -- UNCOMMENT THE 6 LINES BELOW (i.e. delete the # at the start of the lines)
# -- CLICK THE 'RUN' BUTTON BELOW
# Check the 'Console' tab again, and you'll see how we're extracting
# the HTML that was inside <td></td> tags.
# We use lxml, which is a Python library especially for parsing html.
# -----------------------------------------------------------------------------
html = html.replace('<br>', ' ')
html = re.sub(r'(\&.*?;)|(\n|\t|\r)', ' ', html)
print html

issues = []
root = lxml.html.fromstring(html)  # turn our HTML into an lxml object
cleaner = Cleaner(remove_tags=['font', 'span'], links=False, remove_unknown_tags=False)
root = cleaner.clean_html(root)
newhtml = lxml.html.tostring(root)

record = {}
datestring = re.findall("Updated (.*?)</p>", newhtml)[0]
date = time.strptime(datestring, '%b %d, %Y')  # parse the date using format "Mon dd, YYYY"
date = time.strftime('%Y-%m-%d', date)  # format that date back into a string of format YYYY-mm-dd

if scraperwiki.sqlite.get_var('last_update'
from lxml import etree
from lxml.html.clean import Cleaner
import sys

doc = open(sys.argv[1]).read()
cleaner = Cleaner(page_structure=False)
doc = cleaner.clean_html(doc)
tree = etree.HTML(doc)

# gives the heading
# heading = tree.xpath('//*[@id="firstClickFreeAllowed"]/div[1]/div/div[1]/div/div/div/header/h2')
# print etree.tostring(heading[0], pretty_print=True)

# gives the definitions
definitions = tree.xpath('//*[@id="firstClickFreeAllowed"]/div[1]/div')
totalmeanings = len(definitions)
i = 1

print '''
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" class="no-js">
<body class=" ad_trick">
'''

for defi in definitions:
    print etree.tostring(
        tree.xpath('//*[@id="firstClickFreeAllowed"]/div[1]/div[' + str(i) + ']/div[1]')[0],
        pretty_print=True)
    i = i + 1
class feed_reader:
    """parse a list of feeds and return details as dictionary data"""
    # create the html cleaner, this is to clean out unwanted html tags in the description text
    # page_structure=True, remove_unknown_tags=True
    html_cleaner = Cleaner()
    html_cleaner.javascript = True
    html_cleaner.style = True
    html_cleaner.remove_tags = ['script', 'iframe', 'link', 'style', 'img', 'div']
    #~ html_cleaner.allow_tags = ['a', 'p', 'strong']

    filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
        days=int(1.5 * 365))  # a year and a half ago

    html_img_cleaner = Cleaner(allow_tags=['img'], remove_unknown_tags=False)
    html_img_cleaner.allow_tags = ['img']

    html_parser = lxml.etree.HTMLParser()
    xml_parser = lxml.etree.XMLParser(remove_blank_text=True, ns_clean=True,
                                      encoding='utf-8')

    enable_date_filter = True

    def __init__(self, feed_details, timeout=5):
        self.results = {}
        for feed_info in feed_details:
            self.url = feed_info.get('url')
            self.author = feed_info.get('author')
            self.tags = feed_info.get('tags')
            if feed_info.get('url').startswith('http:'):
                try:
                    response = requests.get(feed_info.get('url'), stream=True,
                                            timeout=timeout)
                except requests.exceptions.Timeout as e:
                    continue
                if response.headers.get('content-encoding') == 'gzip':
                    response.raw.read = functools.partial(response.raw.read,
                                                          decode_content=True)
                try:
                    self.feed = lxml.etree.parse(response.raw, self.xml_parser)
                except:
                    continue
            else:
                with open(os.path.abspath(feed_info.get('url')), 'r') as file_stream:
                    try:
                        self.feed = lxml.etree.parse(file_stream, self.xml_parser)
                    except:
                        continue
            self.feed = self.feed.getroot()
            # rss feed defaults
            self.channel_image = self.fetch_node_text(self.feed, 'channel/image/url', '')
            self.parse_feed()

    def convert_rfc822_to_datetime(self, rfcdate):
        """rss uses rfc822 dates so lets convert them to datetime for use later"""
        if len(rfcdate):
            parsed_rfcdate = parsedate_tz(rfcdate)
            if not parsed_rfcdate:
                return None
            return datetime.datetime.fromtimestamp(
                mktime_tz(parsed_rfcdate), pytz.utc).replace(tzinfo=None)
        return None

    def clean_up_text(self, text):
        """strip out any dirty tags like <script>, they may break the sites"""
        if text is None:
            return ''
        cleaned_html = self.html_cleaner.clean_html(text)
        # parse large text separately
        if len(text) > 600:
            description = lxml.etree.parse(StringIO.StringIO(cleaned_html),
                                           self.html_parser)
            root = description.getroot()
            build = ''
            for node in root[-1][-1].iter():
                # skip any nodes with no text
                if node.text is None and node.tail is None:
                    continue
                # we may want to do some other node checks here
                # perhaps count paragraphs, html layout changes a lot
                if node.tag == 'br':
                    return build
                else:
                    if node.tag == 'a' and node.text is None:
                        build += node.tail
                    else:
                        build += etree.tostring(node)
        return self.html_cleaner.clean_html(text)

    def fetch_image_from_node_text(self, text):
        description = lxml.etree.parse(StringIO.StringIO(text), self.html_parser)
        for image in description.xpath('.//img'):
            return image.get('src')
        return None

    def fetch_image(self, node):
        """Try and get an image from an item in the feed, use various fall back methods"""
        image = node.xpath('media:thumbnail', namespaces=namespaces)
        if image:
            return image[0].get('url', '')
        # no media:thumbnail so lets try and grab an image from content:encoded
        image = node.xpath('content:encoded', namespaces=namespaces)
        if image:
            image = self.fetch_image_from_node_text(image[0].text)
            if image:
                return image
        # final attempt at getting an image from the item using description
        result = self.fetch_node_text(node, 'description')
        if result:
            image = self.fetch_image_from_node_text(result)
            if image:
                return image
        # no image so lets fall back to the channel image if it exists
        return self.channel_image

    def fetch_node_text(self, node, name, default=''):
        """fetch the text from the node we are given, we are working in unicode
        so decode byte strings to unicode"""
        result = node.xpath('./%s' % name)
        if result is None or len(result) == 0:
            return default
        if type(result[-1].text) is str:
            return result[-1].text.decode('utf8')
        else:
            return result[-1].text

    def fetch_node_attribute(self, node, name, attribs, default):
        result = node.xpath('./%s' % name)
        if result:
            return result.get(attribs, '')
        else:
            return default

    def format_author(self, author):
        """extract the author's name from the author text node"""
        return author.split('(')[-1].strip(')')

    def filter_by_tags(self, node, tags=None):
        """filter the feed out by category tag, if no tags assume it's pre-filtered"""
        if self.tags is None:
            return True
        for category in node.xpath('./category', namespaces=namespaces):
            if category.text.lower() in self.tags:
                return True
        return False

    def filter_by_date(self, date):
        """filter the feed out by date"""
        if self.enable_date_filter is False:
            return True
        if date > self.filter_by_date_expire:
            return True
        return False

    def parse_feed(self):
        """Parse the items in the feed, filter out bad data and put in defaults"""
        for item in self.feed.xpath('.//item', namespaces=namespaces):
            date = self.convert_rfc822_to_datetime(
                self.fetch_node_text(item, 'pubDate'))
            if self.filter_by_date(date) and self.filter_by_tags(item):
                author = self.format_author(
                    self.fetch_node_text(item, 'author', self.author))
                self.results.setdefault(author, []).append({
                    'title': self.fetch_node_text(item, 'title'),
                    'date': date,
                    'url': self.fetch_node_text(item, 'link'),
                    'author': author,
                    'image': self.fetch_image(item),
                    'description': self.clean_up_text(
                        self.fetch_node_text(item, 'description'))
                })
        # order each author's articles by date
        for author in self.results.keys():
            self.results[author] = sorted(self.results[author],
                                          key=itemgetter('date'), reverse=True)

    def alternate_dict_and_sort_by_list_item_key(self, dict_of_lists, sort_key='date'):
        """take a dictionary of ordered lists, step through each row, sort the
        current item position in each list and yield the result.
        basically gives the ordering of date while stepping through the blog
        entries to make it fair for people who do not blog often."""
        longest_list_length = max(
            [len(dict_of_lists[d]) for d in dict_of_lists.keys()])
        for i in xrange(0, longest_list_length):
            # get the first value from each key, and order the list
            results = sorted([d.pop() for d in dict_of_lists.values() if d],
                             key=itemgetter(sort_key), reverse=True)
            for item in results:
                yield item

    def __iter__(self):
        """return results ordered by date"""
        for author in self.alternate_dict_and_sort_by_list_item_key(self.results):
            yield author
from app.chat.forms import LoginForm, RoomAddForm, ChangeNicknameForm
from app.chat.models import User, ROLE_USER, Room
from config import DATABASE_QUERY_TIMEOUT, OPENID_PROVIDERS
from flask import g, render_template, flash, url_for, request, session, redirect
from flask.ext.login import current_user, logout_user, login_user, login_required
from flask.ext.socketio import join_room, emit, leave_room
from flask.ext.sqlalchemy import get_debug_queries
from lxml.html import fromstring, iterlinks, make_links_absolute
from lxml.html.clean import Cleaner, autolink_html
from markupsafe import Markup

cleaner = Cleaner(
    style=True,
    links=True,
    add_nofollow=True,
    page_structure=True,
    safe_attrs_only=False,
    remove_tags=['p']
)


@lm.user_loader
def load_user(id):
    return User.query.get(int(id))


@chat.before_request
def before_request():
    g.user = current_user
    if g.user.is_authenticated():
        g.user.last_seen = datetime.utcnow()
normHtmlFile = lzma.open(options.outDir + "/" + options.prefix + "normalized_html.xz", "w")
plainTextFile = lzma.open(options.outDir + "/" + options.prefix + "plain_text.xz", "w")

# Boilerpipe cleaning is optional
if options.boilerpipe:
    deboilFile = lzma.open(options.outDir + "/" + options.prefix + "deboilerplate_html.xz", "w")

for record in f:
    # We convert into UTF8 first of all
    orig_encoding, text = convert_encoding(record.payload.read())
    url = record.url
    if orig_encoding is None:
        logging.info("Encoding of document " + url + " could not be identified")

    if len(text) > 0:
        # HTML is then normalized
        cleaner = Cleaner(style=True, links=True, add_nofollow=True,
                          page_structure=False, safe_attrs_only=False)
        tree = ""
        try:
            cleanhtml = cleaner.clean_html(
                re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE))
            tree = ftfy.fix_text(cleanhtml, fix_entities=False, fix_character_width=False)
            # document = html5lib.parse(fixedtext, treebuilder="lxml", namespaceHTMLElements=False)
            # tree = etree.tostring(document, encoding="utf-8")
        except Exception as ex:
            sys.stderr.write(str(ex) + "\n")
            continue
        cleantree = tree.replace("\u00a0", " ")  # normalize non-breaking spaces
        cleantree = cleantree.replace("\t", " ")

        # lang id
        # printable_str = ''.join(x for x in cleantree if x in string.printable)
def cleanup(data, tags):
    cleaner = Cleaner(remove_tags=tags)
    clean = cleaner.clean_html(data)
    root = lxml.html.fromstring(clean)
    return root
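# A small usage sketch, not from the source, assuming the module imports lxml
# and Cleaner: remove_tags drops the named tags but keeps their text, and the
# result is re-parsed into an element tree.
page = '<div><b>bold</b> and <i>italic</i> text</div>'
root = cleanup(page, tags=['b', 'i'])  # <b>/<i> removed, their text kept
print(root.text_content())  # expected: bold and italic text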
def get_message_tree(self):
    tree = {
        'id': self.get_msg_info(self.index.MSG_ID),
        'tags': self.get_msg_info(self.index.MSG_TAGS).split(','),
        'summary': self.get_msg_summary(),
        'headers': {},
        'headers_lc': {},
        'attributes': {},
        'text_parts': [],
        'html_parts': [],
        'attachments': [],
        'conversation': [],
    }

    conv_id = self.get_msg_info(self.index.MSG_CONV_ID)
    if conv_id:
        conv = Email(self.index, int(conv_id, 36))
        tree['conversation'] = convs = [conv.get_msg_summary()]
        for rid in conv.get_msg_info(self.index.MSG_REPLIES).split(','):
            if rid:
                convs.append(Email(self.index, int(rid, 36)).get_msg_summary())

    # FIXME: Decide if this is strict enough or too strict...?
    html_cleaner = Cleaner(page_structure=True, meta=True, links=True,
                           javascript=True, scripts=True, frames=True,
                           embedded=True, safe_attrs_only=True)

    msg = self.get_msg()
    for hdr in msg.keys():
        tree['headers'][hdr] = self.index.hdr(msg, hdr)
        tree['headers_lc'][hdr.lower()] = self.index.hdr(msg, hdr)

    # Note: count algorithm must match that used in extract_attachment above
    count = 0
    for part in msg.walk():
        mimetype = part.get_content_type()
        if mimetype.startswith('multipart/'):
            continue
        count += 1
        if (part.get('content-disposition', 'inline') == 'inline'
                and mimetype in ('text/plain', 'text/html')):
            payload, charset, openpgp = self.decode_payload(part)
            # FIXME: Do something with the openpgp data!
            if (mimetype == 'text/html' or
                    '<html>' in payload or
                    '</body>' in payload):
                tree['html_parts'].append({
                    'openpgp_status': openpgp and openpgp[0] or '',
                    'openpgp_data': openpgp and openpgp[1] or '',
                    'charset': charset,
                    'type': 'html',
                    'data': (payload.strip() and html_cleaner.clean_html(payload)) or ''
                })
            else:
                tree['text_parts'].extend(self.parse_text_part(payload, charset, openpgp))
        else:
            tree['attachments'].append({
                'mimetype': mimetype,
                'count': count,
                'part': part,
                'length': len(part.get_payload(None, True) or ''),
                'content-id': part.get('content-id', ''),
                'filename': part.get_filename() or ''
            })

    if self.is_editable():
        tree['is_editable'] = True
        tree['editing_string'] = self.get_editing_string(tree)

    return tree
def __init__(self, html):
    cleaner = Cleaner(style=True, page_structure=False,
                      remove_tags=('br',), safe_attrs_only=False)
    self.html = cleaner.clean_html(html)
def handle(self, *args, **options):
    individuals = Individual.objects.all()
    for y1, y2 in year_ranges:
        url = url_pattern % (y1, y2, y1, y2)
        r = requests.get(url)
        r.encoding = "utf-8"
        output = r.text
        root = etree.HTML(output)
        dates = [d.text for d in root.xpath(
            "//h2[@class=\"h3_style\"]/a[contains(@href,\"agenda\")]")]
        tables = root.xpath("//table[@class=\"interlaced\"]")
        if len(dates) != len(tables):
            raise Exception("Dates and Questions Mismatch! %d <> %d" %
                            (len(dates), len(tables)))
        for i in range(0, len(dates)):
            date = datetime.strptime(dates[i], '%d.%m.%Y')
            print date
            table = tables[i]
            for row in table.xpath(".//tr")[1:]:
                cells = row.xpath("td")
                if all_text(cells[3]).strip() == '-':
                    continue
                legislator_name = cells[1].text
                if legislator_name.startswith(u"郭偉强"):
                    legislator_name = u"郭偉強"
                title = all_text(cells[2])
                question_type_text = all_text(cells[0])
                individual = None
                for p in individuals:
                    if legislator_name.startswith(p.name_ch):
                        individual = p
                        break
                if individual is None:
                    print(legislator_name)
                    raise Exception("Individual not found. ", legislator_name)
                link = cells[3].xpath(".//a")[0].attrib['href']
                key = str(md5.new(link).hexdigest())
                m = re.match(r"(.*[0-9]+|UQ)[\(]{0,1}(.*)\)", question_type_text)
                if m is None:
                    raise Exception("Undefined Question Type", link, question_type_text)
                question_type = m.group(2)
                detail_r = requests.get(link)
                detail_r.encoding = "big5"
                output = detail_r.text
                cleaner = Cleaner(comments=False)
                output = cleaner.clean_html(output)
                detail_root = etree.HTML(output)
                try:
                    press_release = all_text(
                        detail_root.xpath("//div[@id=\"pressrelease\"]")[0])
                except IndexError:
                    detail_r = requests.get(link)
                    detail_r.encoding = "utf-8"
                    output = detail_r.text
                    output = cleaner.clean_html(output)
                    detail_root = etree.HTML(output)
                    press_release = all_text(
                        detail_root.xpath("//span[@id=\"pressrelease\"]")[0])
                question_start = press_release.find(u'以下')
                reply_start = press_release.rfind(u'答覆:')
                question_text = press_release[question_start:reply_start]
                answer_text = press_release[reply_start + 3:]
                # print(question_text)
                # print(answer_text)
                # print link
                # print date
                # print individual.name_en
                # print key
                # print question_type
                question = Question()
                question.key = key
                question.individual = individual
                question.date = date
                question.question_type = question_type
                question.question = question_text
                question.answer = answer_text
                question.title = title
                question.link = link
                question.title_ch = title
                try:
                    question.save()
                except IntegrityError:
                    print("%s %s already exists" % (str(date), title))
cleaner = Cleaner(
    allow_tags=(
        "a", "img",
        "h1", "h2", "h3",
        "strong", "em", "b", "i", "sub", "sup",
        "p", "br", "hr", "pre", "div",
        "ul", "ol", "li",
        "table", "thead", "tbody", "tr", "th", "td",
    ),
    remove_unknown_tags=False,
    safe_attrs=set(["class", "href", "src", "alt"]),
)
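# A minimal sketch of what this whitelist cleaner does (not from the source):
# tags outside allow_tags are stripped while their text is kept, and
# attributes outside safe_attrs are dropped. Note lxml wraps the cleaned
# fragment in a <div> when allow_tags is used.
print(cleaner.clean_html(
    '<article data-id="7"><h1 class="t">Title</h1>'
    '<blink>flashy</blink><p onmouseover="x()">body</p></article>'
))
# expected roughly:
# <div><h1 class="t">Title</h1>flashy<p>body</p></div>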
def get_best_next_url(*urls):
    """Returns the safest URL to redirect to from a given list."""
    for url in urls:
        url = urljoin(settings.APP_UI_URL, url)
        if url and is_safe_url(url):
            return url
    return settings.APP_UI_URL


CLEANER = Cleaner(style=True, meta=True, links=False,
                  remove_tags=['body', 'form'],
                  kill_tags=['area', 'audio', 'base', 'bgsound', 'embed',
                             'frame', 'frameset', 'head', 'img', 'iframe',
                             'input', 'link', 'map', 'meta', 'nav', 'object',
                             'plaintext', 'track', 'video'])


def sanitize_html(html_text, base_url):
    """Remove anything from the given HTML that must not show up in the UI."""
    # TODO: circumvent encoding declarations?
    if html_text is None:
        return
    cleaned = CLEANER.clean_html(html_text)
    html = document_fromstring(cleaned)
    for (el, attr, href, _) in html.iterlinks():
        href = normalize_href(href, base_url)
def serve(base_folder_path) -> Flask:
    app = Flask(__name__)
    app.jinja_env.trim_blocks = True
    app.jinja_env.lstrip_blocks = True

    recipe_parser = RecipeParser()
    recipe_serializer = RecipeSerializer()
    _cleaner = Cleaner(meta=True, embedded=True, links=True, style=True,
                       processing_instructions=True, scripts=True,
                       javascript=True, frames=True, remove_unknown_tags=True,
                       page_structure=True, remove_tags=['body'])

    @app.context_processor
    def pjax_processor():
        def get_root_template():
            if "X-PJAX" in request.headers:
                return "pjax.html"
            return "structure.html"

        return dict(get_root_template=get_root_template)

    @app.template_filter()
    def markdown_to_cleaned_html(markdown):
        unsafe_html_str = commonmark.commonmark(markdown)
        # remove wrapping div
        # https://stackoverflow.com/questions/21420922/how-to-use-cleaner-lxml-html-without-returning-div-tag
        unsafe_doc = document_fromstring(unsafe_html_str)
        clean_doc = _cleaner.clean_html(unsafe_doc)
        clean_html_str = "\n".join(
            tostring(ch, encoding="unicode") for ch in clean_doc)
        return Markup(clean_html_str)

    @app.template_filter()
    def get_recipe_title(child_name: str, parent_path) -> str:
        absolute_path = os.path.join(base_folder_path, parent_path, child_name)
        if os.path.isdir(absolute_path):
            return Markup('<em>Folder</em>')
        try:
            with open(absolute_path, 'r', encoding='UTF-8') as f:
                recipe = recipe_parser.parse(f.read())
            # TODO markdown to html
            return recipe.title
        except RuntimeError:
            return Markup('<strong>Invalid recipe!</strong>')

    @app.template_filter()
    def serialize_ingredients(ingredients: List[Ingredient]):
        return ("\n".join(recipe_serializer._serialize_ingredient(i, rounding=2)
                          for i in ingredients)).strip()

    @app.route('/')
    @app.route('/<path:relative_path>')
    def download_file(relative_path=''):
        absolute_path = os.path.join(base_folder_path, relative_path)

        if os.path.isdir(absolute_path):
            if not absolute_path.endswith('/'):
                return redirect(f'/{relative_path}/', code=302)
            child_paths = [(ch, os.path.isdir(os.path.join(absolute_path, ch)))
                           for ch in os.listdir(absolute_path)]
            child_paths = [(ch, is_dir) for ch, is_dir in child_paths
                           if not ch.startswith('.') and (is_dir or ch.endswith('.md'))]
            child_paths = [f'{ch}/' if not ch.endswith('/') and is_dir else ch
                           for ch, is_dir in child_paths]
            child_paths = sorted(child_paths)
            return render_template("folder.html", child_paths=child_paths,
                                   path=relative_path)

        if not absolute_path.endswith('.md'):
            return send_from_directory(base_folder_path, relative_path)

        with open(absolute_path, 'r', encoding='UTF-8') as f:
            required_yield_str = request.args.get('yield', '1')
            required_yield = recipe_parser.parse_amount(required_yield_str)
            if required_yield is None:
                required_yield = Amount(factor=Decimal(1))

            src = f.read()

            try:
                recipe = recipe_parser.parse(src)
            except Exception as e:
                return render_template("markdown.html", markdown=src,
                                       path=relative_path, errors=[e.args[0]])

            errors = []
            try:
                recipe = get_recipe_with_yield(recipe, required_yield)
            except StopIteration:
                errors.append(
                    f'The recipe does not specify a yield in the unit "{required_yield.unit}". '
                    f'The following units can be used: ' +
                    ", ".join(f'"{y.unit}"' for y in recipe.yields))
            except Exception as e:
                errors.append(str(e))

            return render_template(
                "recipe.html",
                recipe=recipe,
                yields=recipe_serializer._serialize_yields(recipe.yields, rounding=2),
                tags=recipe_serializer._serialize_tags(recipe.tags),
                units=list(set(y.unit for y in recipe.yields)),
                default_yield=recipe_serializer._serialize_amount(recipe.yields[0])
                if recipe.yields else "1",
                path=relative_path,
                errors=errors)

    return app
import logging
import re

from lxml import etree
from lxml.html.clean import Cleaner

from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim
from .xpaths import COMMENTS_DISCARD_XPATH, DISCARD_XPATH

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
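# An illustrative call, not from the source module: with the deliberately
# minimal flags above, this cleaner mostly removes comments and processing
# instructions, leaving script/style handling to the manual tag lists.
from lxml import html as lxml_html

doc = lxml_html.fromstring('<html><body><!-- boilerplate --><p>kept text</p></body></html>')
cleaned = HTML_CLEANER.clean_html(doc)
print(etree.tostring(cleaned, encoding='unicode'))  # the comment node is gone, <p> remains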
class HackathonManager(Component): """Component to manage hackathon Note that it only handle operations directly related to Hackathon table. Things like registerd users, templates are in separated components """ admin_manager = RequiredFeature("admin_manager") user_manager = RequiredFeature("user_manager") register_manager = RequiredFeature("register_manager") # basic xss prevention cleaner = Cleaner(safe_attrs=lxml.html.defs.safe_attrs | set(['style'])) # preserve style def is_hackathon_name_existed(self, name): """Check whether hackathon with specific name exists or not :type name: str|unicode :param name: name of hackathon :rtype: bool :return True if hackathon with specific name exists otherwise False """ hackathon = self.get_hackathon_by_name(name) return hackathon is not None def is_recycle_enabled(self, hackathon): key = HACKATHON_CONFIG.RECYCLE_ENABLED return self.get_basic_property(hackathon, key, False) def get_hackathon_by_name(self, name): """Get hackathon accoring the unique name :type name: str|unicode :param name: name of hackathon :rtype: Hackathon :return hackathon instance if found else None """ if not name: return None return Hackathon.objects(name=name).first() def get_hackathon_by_id(self, hackathon_id): """Query hackathon by id :type hackathon_id: str or ObjectId are both ok :param hackathon_id: _id of hackathon :return hackathon instance or None """ return Hackathon.objects(id=hackathon_id).first() def get_hackathon_detail(self, hackathon): user = None if self.user_manager.validate_login(): user = g.user return self.__get_hackathon_detail(hackathon, user) def get_hackathon_stat(self, hackathon): def internal_get_stat(): return self.__get_hackathon_stat(hackathon) cache_key = "hackathon_stat_%s" % hackathon.id return self.cache.get_cache(key=cache_key, createfunc=internal_get_stat) # TODO: implement HackathonStat related features: order_by == 'registered_users_num': def get_hackathon_list(self, args): # get values from request's QueryString page = int(args.get("page", 1)) per_page = int(args.get("per_page", 20)) order_by = args.get("order_by", "create_time") status = args.get("status") name = args.get("name") # build query by search conditions and order_by status_filter = Q() name_filter = Q() condition_filter = Q() order_by_condition = '-id' if status: status_filter = Q(status=status) if name: name_filter = Q(name__contains=name) if order_by == 'create_time': # 最新发布 order_by_condition = '-create_time' elif order_by == 'event_start_time': # 即将开始 order_by_condition = '-event_start_time' elif order_by == 'registered_users_num': # 人气热点 # hackathons with zero registered users would not be shown. 
hot_hackathon_stat = HackathonStat.objects( type=HACKATHON_STAT.REGISTER, count__gt=0).order_by('-count') hot_hackathon_list = [ stat.hackathon.id for stat in hot_hackathon_stat ] condition_filter = Q(id__in=hot_hackathon_list) else: order_by_condition = '-id' # perform db query with pagination pagination = Hackathon.objects(status_filter & name_filter & condition_filter).order_by( order_by_condition).paginate( page, per_page) hackathon_list = pagination.items hackathon_stat = HackathonStat.objects(hackathon__in=hackathon_list) user = None user_hackathon = [] team = [] if self.user_manager.validate_login(): user = g.user user_hackathon = UserHackathon.objects( user=user, hackathon__in=hackathon_list) team = Team.objects(members__user=user, hackathon__in=hackathon_list) def func(hackathon): return self.__fill_hackathon_detail(hackathon, user, hackathon_stat, user_hackathon, team) # return serializable items as well as total count return self.util.paginate(pagination, func) def get_online_hackathons(self): return Hackathon.objects(status=HACK_STATUS.ONLINE) def get_user_hackathon_list_with_detail(self, user_id): user_hackathon_rels = UserHackathon.objects( user=user_id, role=HACK_USER_TYPE.COMPETITOR).all() def get_user_hackathon_detail(user_hackathon_rel): dict = user_hackathon_rel.dic() dict["hackathon_info"] = user_hackathon_rel.hackathon.dic() return dict return [get_user_hackathon_detail(rel) for rel in user_hackathon_rels] def get_recyclable_hackathon_list(self): # todo filter hackathons by hackathon.config in a db-level if possible hackathons = Hackathon.objects( status=HACK_STATUS.ONLINE, event_start_time__lt=self.util.get_now(), event_end_time__gt=self.util.get_now()).all() return [h for h in hackathons if self.is_recycle_enabled(h)] def get_basic_property(self, hackathon, key, default=None): """Get basic property of hackathon from HackathonConfig""" if hackathon.config: return hackathon.config.get(key, default) return default def get_all_properties(self, hackathon): config = hackathon.config return config if config else {} def set_basic_property(self, hackathon, properties): """Set basic property in table HackathonConfig""" hackathon.config.update(properties) hackathon.save() self.cache.invalidate(self.__get_config_cache_key(hackathon)) return ok() def delete_basic_property(self, hackathon, keys): if isinstance(keys, str): keys = keys.split() list(map(lambda key: hackathon.config.pop(key, None), keys)) hackathon.save() self.cache.invalidate(self.__get_config_cache_key(hackathon)) return ok() def get_recycle_minutes(self, hackathon): key = HACKATHON_CONFIG.RECYCLE_MINUTES minutes = self.get_basic_property(hackathon, key, 60) return int(minutes) def validate_hackathon_name(self): if HTTP_HEADER.HACKATHON_NAME in request.headers: try: hackathon_name = request.headers[HTTP_HEADER.HACKATHON_NAME] hackathon = Hackathon.objects(name=hackathon_name).first() if hackathon: g.hackathon = hackathon return True else: self.log.debug("cannot find hackathon by name %s" % hackathon_name) return False except Exception as ex: self.log.error(ex) self.log.debug("hackathon_name invalid") return False else: self.log.debug("hackathon_name not found in headers") return False def create_new_hackathon(self, context): """Create new hackathon based on the http body Hackathon name is unique so duplicated names are not allowd. 
:type context: Context :param context: the body of http request that contains fields to create a new hackathon :rtype: dict """ if Hackathon.objects(name=context.name).count() > 0: raise PreconditionFailed("hackathon name already exists") self.log.debug("add a new hackathon:" + context.name) new_hack = self.__create_hackathon(g.user, context) self.create_hackathon_notice( new_hack.id, HACK_NOTICE_EVENT.HACK_CREATE, HACK_NOTICE_CATEGORY.HACKATHON) # hackathon create # init data is for local only if self.util.is_local(): self.__create_default_data_for_local(new_hack) return new_hack.dic() def update_hackathon(self, args): """Update hackathon properties :type args: dict :param args: arguments from http request body that contains properties with new values :rtype dict :return hackathon in dict if updated successfully. """ hackathon = g.hackathon try: update_items = self.__parse_update_items(args, hackathon) self.log.debug("update hackathon items :" + str(list(args.keys()))) if 'config' in update_items: self.set_basic_property(hackathon, update_items.get('config', {})) update_items.pop('config', None) # basic xss prevention if 'description' in update_items and update_items['description']: #update_items['description'] = self.cleaner.clean_html(update_items['description']) self.log.debug("hackathon description :" + update_items['description']) hackathon.modify(**update_items) hackathon.save() return ok() except Exception as e: self.log.error(e) return internal_server_error("fail to update hackathon") def delete_hackathon(self): """delete hackathon :return hackathon in dict if updated successfully. """ hackathon = g.hackathon try: UserHackathon.objects(hackathon=hackathon).delete() self.log.debug("delete hackathon:" + hackathon.name) hackathon.delete() hackathon.save() return ok() except Exception as e: self.log.error(e) return internal_server_error("fail to delete hackathon" + hackathon.name) def apply_online_hackathon(self, hackathon): """apply for onlining a hackathon, should be called by the hackathon creator :return hackathon in dict if updated successfully. 
""" try: req = ok() if hackathon.status == HACK_STATUS.OFFLINE or hackathon.status == HACK_STATUS.DRAFT: hackathon.status = HACK_STATUS.APPLY_ONLINE hackathon.save() elif hackathon.status == HACK_STATUS.INIT: req = general_error(code=HTTP_CODE.CREATE_NOT_FINISHED) return req except Exception as e: self.log.error(e) return internal_server_error("fail to delete hackathon" + hackathon.name) def get_userlike_all_hackathon(self, user_id): user_hackathon_rels = UserHackathon.objects(user=user_id).all() def get_user_hackathon_detail(user_hackathon_rel): dict = user_hackathon_rel.dic() dict["hackathon_info"] = user_hackathon_rel.hackathon.dic() return dict return [get_user_hackathon_detail(rel) for rel in user_hackathon_rels] def like_hackathon(self, user, hackathon): user_hackathon = UserHackathon.objects(hackathon=hackathon, user=user).first() if user_hackathon and user_hackathon.like: return ok() if not user_hackathon: user_hackathon = UserHackathon(hackathon=hackathon, user=user, role=HACK_USER_TYPE.VISITOR, status=HACK_USER_STATUS.UNAUDIT, like=True, remark="") user_hackathon.save() if not user_hackathon.like: user_hackathon.like = True user_hackathon.save() # increase the count of users that like this hackathon self.increase_hackathon_stat(hackathon, HACKATHON_STAT.LIKE, 1) return ok() def unlike_hackathon(self, user, hackathon): user_hackathon = UserHackathon.objects(user=user, hackathon=hackathon).first() if user_hackathon: user_hackathon.like = False user_hackathon.save() # sync the like count like_count = UserHackathon.objects(hackathon=hackathon, like=True).count() self.update_hackathon_stat(hackathon, HACKATHON_STAT.LIKE, like_count) return ok() def update_hackathon_stat(self, hackathon, stat_type, count): """Increase or descrease the count for certain hackathon stat :type hackathon: Hackathon :param hackathon: instance of Hackathon to be counted :type stat_type: str|unicode :param stat_type: type of stat that defined in constants.py#HACKATHON_STAT :type count: int :param count: the new count for this stat item """ stat = HackathonStat.objects(hackathon=hackathon, type=stat_type).first() if stat: stat.count = count stat.update_time = self.util.get_now() else: stat = HackathonStat(hackathon=hackathon, type=stat_type, count=count) if stat.count < 0: stat.count = 0 stat.save() def increase_hackathon_stat(self, hackathon, stat_type, increase): """Increase or descrease the count for certain hackathon stat :type hackathon: Hackathon :param hackathon: instance of Hackathon to be counted :type stat_type: str|unicode :param stat_type: type of stat that defined in constants.py#HACKATHON_STAT :type increase: int :param increase: increase of the count. 
Can be positive or negative """ stat = HackathonStat.objects(hackathon=hackathon, type=stat_type).first() if stat: stat.count += increase else: stat = HackathonStat(hackathon=hackathon, type=stat_type, count=increase) if stat.count < 0: stat.count = 0 stat.update_time = self.util.get_now() stat.save() def get_distinct_tags(self): """Return all distinct hackathon tags for auto-complete usage""" return self.db.session().query(HackathonTag.tag).distinct().all() def create_hackathon_organizer(self, hackathon, body): organizer = Organization(id=uuid.uuid4(), name=body.name, description=body.get("description", ""), organization_type=body.organization_type, homepage=body.get("homepage", ""), logo=body.get("logo", "")) hackathon.organizers.append(organizer) hackathon.update_time = self.util.get_now() hackathon.save() return hackathon.dic() def update_hackathon_organizer(self, hackathon, body): organizer = hackathon.organizers.get(id=body.id) if not organizer: return not_found() organizer.name = body.get("name", organizer.name) organizer.description = body.get("description", organizer.description) organizer.homepage = body.get("homepage", organizer.homepage) organizer.logo = body.get("logo", organizer.logo) organizer.organization_type = body.get("organization_type", organizer.organization_type) hackathon.update_time = self.util.get_now() hackathon.save() return hackathon.dic() def delete_hackathon_organizer(self, hackathon, organizer_id): if hackathon.organizers.filter(id=organizer_id): hackathon.update(pull__organizers=hackathon.organizers.get( id=organizer_id)) hackathon.update_time = self.util.get_now() hackathon.save() return ok() def create_hackathon_award(self, hackathon, body): level = int(body.get("level")) if level > 10: level = 10 award = Award(id=uuid.uuid4(), name=body.get("name"), sub_name=body.get("sub_name"), description=body.get("description"), level=level, quota=body.get("quota"), award_url=body.get("award_url")) hackathon.update(push__awards=award) hackathon.update_time = self.util.get_now() hackathon.save() return ok() def update_hackathon_award(self, hackathon, body): award = hackathon.awards.get(id=body.get("id")) if not award: return not_found("award not found") level = award.level if "level" in body: level = int(body.get("level")) if level > 10: level = 10 award.name = body.get("name", award.name) award.sub_name = body.get("sub_name", award.sub_name) award.description = body.get("description", award.description) award.level = body.get("level", level) award.quota = body.get("quota", award.quota) award.award_url = body.get("award_url", award.award_url) award.save() hackathon.update_time = self.util.get_now() hackathon.save() return ok() def delete_hackathon_award(self, hackathon, award_id): award = hackathon.awards.get(id=award_id) hackathon.update(pull__awards=award) hackathon.update_time = self.util.get_now() hackathon.save() # delete granted award in teams award_uuid = uuid.UUID(award_id) Team.objects(hackathon=hackathon, awards=award_uuid).update(pull__awards=award_uuid) return ok() def list_hackathon_awards(self, hackathon): awards = hackathon.dic()["awards"] awards.sort(key=lambda award: -award["level"]) return awards def get_hackathon_notice(self, notice_id): hackathon_notice = HackathonNotice.objects(id=notice_id).first() if not hackathon_notice: return not_found("hackathon_notice not found") return hackathon_notice.dic() def create_hackathon_notice(self, hackathon_id, notice_event, notice_category, body={}): """ create hackathon notice with hackathon_id, notice_event, 
notice_category. notice 'content' and 'link' can be included in body (optional) :type hackathon_id: int :param hackathon_id: id of hackathon that the notice belongs to (-1 if the notice doesn't belong to a specfic hackathon) :type notice_event: Class HACK_NOTICE_EVENT :param notice_event: event that the notice is triggered by, used for notice filtering (see get_hackathon_notice_list()) more specfic than notice_category, new events can be added without disturbing front-end code :type notice_category: Class HACK_NOTICE_CATEGORY :param notice_category: category that the notice belongs to, used for notice filtering and notice properties display at front-end (e.g. icons/descriptions, see oh.manage.notice.js & oh.site.hackathon.js), more general than notice_event, if you want to add a new category in HACK_NOTICE_CATEGORY, remember to update front-end js code as well. :type body: dict/Context, default value: {} :param body: other necessary information, e.g.: 'content'(notice's content), 'link'(notice's link), other keys for specfic uses :return: hackathon_notice in dict ::Example: :create_hackathon_notice(2, HACK_NOTICE_EVENT.xx, HACK_NOTICE_CATEGORY.yy, {'content': 'zz'}) a new notice for a hackathon with id 2 is created for the propose of HACK_NOTICE_EVENT.xx. The notice's front-end icon and description is determined by HACK_NOTICE_CATEGORY.yy, while its content is 'zz' and its link url is '' :create_hackathon_notice(-1, HACK_NOTICE_EVENT.xx, HACK_NOTICE_CATEGORY.yy) a new notice not belongs to any hackathon is created for the propose of HACK_NOTICE_EVENT.xx. The notice's front-end icon and description is determined by HACK_NOTICE_CATEGORY.yy, while its content and link url is '' """ hackathon_notice = HackathonNotice(content='', link='', event=notice_event, category=notice_category) hackathon = self.get_hackathon_by_id(hackathon_id) if hackathon: hackathon_notice.hackathon = hackathon # notice creation logic for different notice_events if hackathon: if notice_event == HACK_NOTICE_EVENT.HACK_CREATE: hackathon_notice.content = "%s即将火爆来袭,敬请期待!" % ( hackathon.display_name) # elif notice_event == HACK_NOTICE_EVENT.HACK_EDIT and hackathon: # hackathon_notice.content = u"%s更新啦,快去看看!" % (hackathon.display_name) elif notice_event == HACK_NOTICE_EVENT.HACK_ONLINE: hackathon_notice.content = "%s开始啦,点击报名!" % ( hackathon.display_name) hackathon_notice.link = "/site/%s" % hackathon.name elif notice_event == HACK_NOTICE_EVENT.HACK_OFFLINE: hackathon_notice.content = "%s圆满结束,点击查看详情!" 
% ( hackathon.display_name) hackathon_notice.link = "/site/%s" % hackathon.name elif notice_event == HACK_NOTICE_EVENT.HACK_PLAN and body.get( 'receiver', None): user = body.get('receiver') old_hackathon_notice = HackathonNotice.objects( receiver=user, event=HACK_NOTICE_EVENT.HACK_PLAN, hackathon=hackathon).first() if old_hackathon_notice: # duplicate return old_hackathon_notice.dic() hackathon_notice.content = "您有未完成的任务,请提交开发说明书" hackathon_notice.receiver = user hackathon_notice.link = "/site/%s/team" % (hackathon.name) else: pass if notice_event == HACK_NOTICE_EVENT.EXPR_JOIN and body.get('user_id'): user_id = body.get('user_id') user = self.user_manager.get_user_by_id(user_id) hackathon_notice.content = "用户 %s 开始编程" % (user.nickname) else: pass # use assigned value if content or link is assigned in body hackathon_notice.content = body.get('content', hackathon_notice.content) hackathon_notice.link = body.get('link', hackathon_notice.link) hackathon_notice.save(validate=False) self.log.debug( "a new notice is created: hackathon: %s, event: %d, category: %d" % (hackathon.name, notice_event, notice_category)) return hackathon_notice.dic() def update_hackathon_notice(self, body): hackathon_notice = HackathonNotice.objects(id=body.get('id')).first() if not hackathon_notice: return not_found("hackathon_notice not found") hackathon_notice.content = body.get("content", hackathon_notice.content) hackathon_notice.link = body.get("link", hackathon_notice.link) hackathon_notice.category = body.get("category", hackathon_notice.category) hackathon_notice.update_time = self.util.get_now() hackathon_notice.save(validate=False) return hackathon_notice.dic() def delete_hackathon_notice(self, notice_id): hackathon_notice = HackathonNotice.objects(id=notice_id).first() if not hackathon_notice: return not_found('Hackathon notice not found') hackathon_notice.delete() return ok() def get_hackathon_notice_list(self, body): """ list hackathon notices, notices are paginated, can be filtered by hackathon_name, event and category, can be ordered by update_time, event and category. 
:type body: Context :param body: valid key/values(all key/values are optional) body = { hackathon_name: string, // filter by hackathon_name, default unfiltered filter_by_user: '******' | 'all', // filter by user, default filter all notice that has specfic receivers category: 'int[,int...]', // filter by category, default unfiltered event: 'int[,int...]', // filter by event, default unfiltered order_by: 'time' | 'event' | 'category', // order by update_time, event, category, default by time page: int, // page number after pagination, start from 1, default 1 per_page: int // items per page, default 1000 } :return: json style text, see util.Utility ::Example: : body = { order_by: 'time', category: '1,2,3', page: 1, per_page: 6 } search first 6 notices ordered by time, filtered by: category in [1,2,3] : body = { hackathon_name: 'hackathon', event: '1', order_by: 'event' } search first 1000 notices ordered by event, filtered by event == 1 and hackathon_name == 'hackathon' """ hackathon_name = body.get("hackathon_name") filter_by_user = body.get("filter_by_user", "") notice_category = body.get("category") notice_event = body.get("event") order_by = body.get("order_by", "time") page = int(body.get("page", 1)) per_page = int(body.get("per_page", 1000)) hackathon_filter = Q() category_filter = Q() event_filter = Q() user_filter = Q(receiver=None) is_read_filter = Q() order_by_condition = '-update_time' if hackathon_name: #list notices that belong to specfic hackathon hackathon = Hackathon.objects( name=hackathon_name).only('name').first() if hackathon: hackathon_filter = Q(hackathon=hackathon) else: return not_found('hackathon_name not found') else: #only list online hackathons' notices or notices that not belong to any hackathon online_hackathon = Hackathon.objects(status=HACK_STATUS.ONLINE) hackathon_filter = Q(hackathon__in=online_hackathon) | Q( hackathon=None) if filter_by_user: # only return notices that are sent to the login user user = None if self.user_manager.validate_login(): user = g.user user_filter = Q(receiver=user) if filter_by_user == 'unread': is_read_filter = Q(is_read=False) else: return bad_request("please login first") if notice_category: notice_category_tuple = tuple( [int(category) for category in notice_category.split(',')]) category_filter = Q(category__in=notice_category_tuple) if notice_event: notice_event_tuple = tuple( [int(event) for event in notice_event.split(',')]) event_filter = Q(event__in=notice_event_tuple) if order_by == 'category': order_by_condition = '+category' elif order_by == 'event': order_by_condition = '+event' else: order_by_condition = '-update_time' pagination = HackathonNotice.objects( hackathon_filter & category_filter & event_filter & user_filter & is_read_filter).order_by(order_by_condition).paginate( page, per_page) def func(hackathon_notice): return hackathon_notice.dic() # return serializable items as well as total count return self.util.paginate(pagination, func) def check_notice_and_set_read_if_necessary(self, id): hackathon_notice = HackathonNotice.objects(id=id).first() if hackathon_notice: user = g.user if not user or user.id != hackathon_notice.receiver.id: # not the user return ok() hackathon_notice.is_read = True if hackathon_notice.event == HACK_NOTICE_EVENT.HACK_PLAN: # set is_read = True if dev_plan is complete user = hackathon_notice.receiver hackathon = hackathon_notice.hackathon team = Team.objects(members__user=user, hackathon=hackathon).first() if team: if not team.dev_plan: # the dev_plan isn't submitted 
hackathon_notice.is_read = False hackathon_notice.save() return ok() def schedule_pre_allocate_expr_job(self): """Add an interval schedule job to check all hackathons""" next_run_time = self.util.get_now() + timedelta(seconds=3) self.scheduler.add_interval( feature="hackathon_manager", method="check_hackathon_for_pre_allocate_expr", id="check_hackathon_for_pre_allocate_expr", next_run_time=next_run_time, minutes=20) def __is_pre_allocate_enabled(self, hackathon): if hackathon.event_end_time < self.util.get_now(): return False # using registration time for better test before event_start_time if hackathon.registration_start_time > self.util.get_now(): return False if hackathon.status != HACK_STATUS.ONLINE: return False if hackathon.config.get(HACKATHON_CONFIG.CLOUD_PROVIDER, CLOUD_PROVIDER.NONE) == CLOUD_PROVIDER.NONE: return False return hackathon.config.get(HACKATHON_CONFIG.PRE_ALLOCATE_ENABLED, False) def check_hackathon_for_pre_allocate_expr(self): """Check all hackathon for pre-allocate Add an interval job for hackathon if it's pre-allocate is enabled. Otherwise try to remove the schedule job """ hackathon_list = Hackathon.objects() for hack in hackathon_list: job_id = "pre_allocate_expr_" + str(hack.id) is_job_exists = self.scheduler.has_job(job_id) if self.__is_pre_allocate_enabled(hack): if is_job_exists: self.log.debug( "pre_allocate job already exists for hackathon %s" % str(hack.name)) continue self.log.debug("add pre_allocate job for hackathon %s" % str(hack.name)) next_run_time = self.util.get_now() + timedelta( seconds=(20 * random.random())) pre_allocate_interval = self.__get_pre_allocate_interval(hack) self.scheduler.add_interval( feature="expr_manager", method="pre_allocate_expr", id=job_id, context=Context(hackathon_id=hack.id), next_run_time=next_run_time, seconds=pre_allocate_interval) elif is_job_exists: self.log.debug( "remove job for hackathon %s since pre_allocate is disabled" % str(hack.id)) self.scheduler.remove_job(job_id) return True def hackathon_online(self, hackathon): req = ok() if hackathon.status == HACK_STATUS.DRAFT or hackathon.status == HACK_STATUS.OFFLINE or hackathon.status == HACK_STATUS.APPLY_ONLINE: if self.util.is_local() or hackathon.config.get( 'cloud_provider') == CLOUD_PROVIDER.NONE: req = ok() elif hackathon.config.get( 'cloud_provider') == CLOUD_PROVIDER.AZURE: raise NotImplementedError() elif hackathon.status == HACK_STATUS.ONLINE: req = ok() else: req = general_error(code=HTTP_CODE.CREATE_NOT_FINISHED) if req.get('error') is None: hackathon.status = HACK_STATUS.ONLINE hackathon.save() self.create_hackathon_notice( hackathon.id, HACK_NOTICE_EVENT.HACK_ONLINE, HACK_NOTICE_CATEGORY.HACKATHON) # hackathon online return req def hackathon_offline(self, hackathon): req = ok() if hackathon.status == HACK_STATUS.ONLINE or hackathon.status == HACK_STATUS.DRAFT or hackathon.status == HACK_STATUS.APPLY_ONLINE: hackathon.status = HACK_STATUS.OFFLINE hackathon.save() self.create_hackathon_notice( hackathon.id, HACK_NOTICE_EVENT.HACK_OFFLINE, HACK_NOTICE_CATEGORY.HACKATHON) # hackathon offline elif hackathon.status == HACK_STATUS.INIT: req = general_error(code=HTTP_CODE.CREATE_NOT_FINISHED) return req # TODO: we need to review those commented items one by one to decide the API output def __get_hackathon_detail(self, hackathon, user=None): """Return hackathon info as well as its details including configs, stat, organizers, like if user logon""" detail = hackathon.dic() detail["stat"] = {"register": 0, "like": 0} for stat in 
    # TODO: we need to review those commented items one by one to decide the API output
    def __get_hackathon_detail(self, hackathon, user=None):
        """Return hackathon info as well as its details, including configs,
        stat, organizers, and likes if the user is logged in."""
        detail = hackathon.dic()
        detail["stat"] = {"register": 0, "like": 0}

        for stat in HackathonStat.objects(hackathon=hackathon):
            if stat.type == HACKATHON_STAT.REGISTER:
                detail["stat"]["register"] = stat.count
            elif stat.type == HACKATHON_STAT.LIKE:
                detail["stat"]["like"] = stat.count

        if user:
            user_hackathon = UserHackathon.objects(hackathon=hackathon, user=user).first()
            if user_hackathon and user_hackathon.like:
                detail['like'] = user_hackathon.like

            detail["user"] = self.user_manager.user_display_info(user)
            detail["user"]["is_admin"] = user.is_super or (
                user_hackathon and user_hackathon.role == HACK_USER_TYPE.ADMIN)

            # TODO: we need to review those items one by one to decide the API output
            # asset = self.db.find_all_objects_by(UserHackathonAsset, user_id=user.id, hackathon_id=hackathon.id)
            # if asset:
            #     detail["asset"] = [o.dic() for o in asset]

            if user_hackathon and user_hackathon.role == HACK_USER_TYPE.COMPETITOR:
                detail["registration"] = user_hackathon.dic()
                team = Team.objects(hackathon=hackathon, members__user=user).first()
                if team:
                    detail["team"] = team.dic()

        return detail

    def __fill_hackathon_detail(self, hackathon, user, hackathon_stat, user_hackathon, team):
        """Return hackathon info as well as its details, including configs,
        stat, organizers, and likes if the user is logged in."""
        detail = hackathon.dic()
        detail["stat"] = {"register": 0, "like": 0}

        for stat in hackathon_stat:
            if stat.type == HACKATHON_STAT.REGISTER and stat.hackathon.id == hackathon.id:
                detail["stat"]["register"] = stat.count
            elif stat.type == HACKATHON_STAT.LIKE and stat.hackathon.id == hackathon.id:
                detail["stat"]["like"] = stat.count

        if user:
            detail['user'] = self.user_manager.user_display_info(user)
            detail['user']['admin'] = user.is_super
            if user_hackathon:
                for uh in user_hackathon:
                    if uh.hackathon.id == hackathon.id:
                        detail['user']['admin'] = detail['user']['admin'] or (
                            uh.role == HACK_USER_TYPE.ADMIN)
                        if uh.like:
                            detail['like'] = uh.like
                        if uh.role == HACK_USER_TYPE.COMPETITOR:
                            detail['registration'] = uh.dic()
                            for t in team:
                                if t.hackathon.id == hackathon.id:
                                    detail['team'] = t.dic()
                                    break
                        break

        return detail
    def __create_hackathon(self, creator, context):
        """Insert a hackathon and its creator (as admin) into the database

        We enforce that the default config is used during creation.

        :type context: Context
        :param context: context of the args to create a new hackathon

        :rtype: Hackathon
        :return: the new hackathon instance
        """
        new_hack = Hackathon(
            name=context.name,
            display_name=context.display_name,
            ribbon=context.get("ribbon"),
            description=context.get("description"),
            short_description=context.get("short_description"),
            location=context.get("location"),
            banners=context.get("banners", []),
            status=HACK_STATUS.INIT,
            creator=creator,
            type=context.get("type", HACK_TYPE.HACKATHON),
            config=context.get("config", Context()).to_dict(),
            tags=context.get("tags", []),
            event_start_time=context.get("event_start_time"),
            event_end_time=context.get("event_end_time"),
            registration_start_time=context.get("registration_start_time"),
            registration_end_time=context.get("registration_end_time"),
            judge_start_time=context.get("judge_start_time"),
            judge_end_time=context.get("judge_end_time"))

        # basic XSS prevention
        if new_hack.description:  # may be None
            new_hack.description = self.cleaner.clean_html(new_hack.description)
        new_hack.save()

        # add the current login user as admin and creator
        try:
            admin = UserHackathon(user=creator,
                                  hackathon=new_hack,
                                  role=HACK_USER_TYPE.ADMIN,
                                  status=HACK_USER_STATUS.AUTO_PASSED,
                                  remark='creator')
            admin.save()
        except Exception as ex:
            # TODO: send an email to remind the administrator to deal with this problem
            self.log.error(ex)
            raise InternalServerError("fail to create the default administrator")

        return new_hack

    def __get_pre_allocate_interval(self, hackathon):
        interval = self.get_basic_property(
            hackathon, HACKATHON_CONFIG.PRE_ALLOCATE_INTERVAL_SECONDS)
        if interval:
            return int(interval)
        else:
            # default: roughly 5 minutes, with a little jitter
            return 300 + random.random() * 50

    def __get_hackathon_configs(self, hackathon):
        def __internal_get_config():
            configs = {}
            for c in hackathon.configs.all():
                configs[c.key] = c.value
            return configs

        cache_key = self.__get_config_cache_key(hackathon)
        return self.cache.get_cache(key=cache_key, createfunc=__internal_get_config)

    def __get_hackathon_organizers(self, hackathon):
        organizers = self.db.find_all_objects_by(HackathonOrganizer, hackathon_id=hackathon.id)
        return [o.dic() for o in organizers]

    def __parse_update_items(self, args, hackathon):
        """Parse properties that need to be updated

        Only items whose values changed will be returned. Static properties
        like id, name and create_time, as well as properties that don't exist
        on the model, will NOT be updated.

        :type args: dict
        :param args: arguments from the http body which contain new values

        :type hackathon: Hackathon
        :param hackathon: the existing Hackathon object which contains old values

        :rtype: dict
        :return: a dict that contains all properties that are to be updated
        """
        result = {}
        args = dict(args)
        hackathon_dic = hackathon.dic()
        for key in args:
            if hasattr(hackathon, key) and (key not in hackathon_dic or
                                            args[key] != hackathon_dic[key]):
                result[key] = args[key]

        result.pop('id', None)
        result.pop('name', None)
        result.pop('creator', None)
        result.pop('create_time', None)
        result['update_time'] = self.util.get_now()
        return result

    def __get_hackathon_stat(self, hackathon):
        stats = HackathonStat.objects(hackathon=hackathon).all()
        result = {"hackathon_id": str(hackathon.id), "online": 0, "offline": 0}
        for item in stats:
            result[item.type] = item.count

        reg_list = UserHackathon.objects(
            hackathon=hackathon,
            role=HACK_USER_TYPE.COMPETITOR,
            deleted=False,
            status__in=[HACK_USER_STATUS.AUTO_PASSED,
                        HACK_USER_STATUS.AUDIT_PASSED]).only("user").no_dereference().all()
        reg_list = [uh.user.id for uh in reg_list]
        reg_count = len(reg_list)
        if reg_count > 0:
            online_count = User.objects(id__in=reg_list, online=True).count()
            result["online"] = online_count
            result["offline"] = reg_count - online_count
        return result

    def __get_config_cache_key(self, hackathon):
        return "hackathon_config_%s" % hackathon.id

    def __create_default_data_for_local(self, hackathon):
        """Create test data for a new hackathon. For local development only.

        :param hackathon:
        :return:
        """
        try:
            # test docker host server
            host = DockerHostServer(vm_name="localhost",
                                    public_dns="localhost",
                                    public_ip="127.0.0.1",
                                    public_docker_api_port=4243,
                                    private_ip="127.0.0.1",
                                    private_docker_api_port=4243,
                                    container_count=0,
                                    container_max_count=100,
                                    disabled=False,
                                    state=DockerHostServerStatus.DOCKER_READY,
                                    hackathon=hackathon)
            host.save()
        except Exception as e:
            self.log.error(e)
            self.log.warn("fail to create test data")
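# --- Aside: a standalone sketch of the diff-update idea used by
# __parse_update_items above. Only keys whose values differ from the stored
# copy survive, and immutable fields are stripped; the hasattr() check from
# the method is omitted here. All names below are illustrative only.
IMMUTABLE_FIELDS = ('id', 'name', 'creator', 'create_time')

def diff_update(args, stored):
    # keep only the keys whose values actually changed
    changed = {k: v for k, v in args.items()
               if k not in stored or stored[k] != v}
    # never let callers overwrite immutable fields
    for key in IMMUTABLE_FIELDS:
        changed.pop(key, None)
    return changed

# diff_update({'ribbon': 'new', 'id': 'x'}, {'ribbon': 'old', 'id': 'x'})
# -> {'ribbon': 'new'}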
from bs4 import BeautifulSoup
from bs4.element import Tag
import logging
import logging.config
import re

from lxml.html.clean import Cleaner
from readability import htmls
from extractor.util import load_log_config

logging.config.dictConfig(load_log_config())
logger = logging.getLogger('applog.' + __name__)

# Handle the jQuery Lazy Load plugin
IMAGE_URL_KEYS = ('src', 'data-lazy-src', 'data-original',)

cleaner = Cleaner(
    scripts=True,
    javascript=True,
    style=True,
    comments=True,
    forms=False,
    links=True,
    processing_instructions=True,
    kill_tags=['footer', 'nav', 'select', 'button', 'noscript'],
)


class Parser(object):
    __REGEX_TITLE_ATTR = re.compile('title', re.IGNORECASE)
    __REGEX_NOT_TITLE_ATTR = re.compile('sub|side|related', re.IGNORECASE)

    def __init__(self, type_='lxml', html_string=''):
        if type_ == 'lxml':
            self._parser = LxmlParser(html_string)
        elif type_ == 'soup':
            self._parser = SoupParser(html_string)
        else:
            raise ValueError('parser type must be lxml or soup')
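# --- Aside: a minimal, self-contained check of what the Cleaner configured
# above removes; the sample markup is made up for illustration.
def _demo_cleaner():
    from lxml import html as lhtml
    sample = ('<body><nav>menu</nav><p>Article text.</p>'
              '<script>track()</script><footer>(c) example</footer></body>')
    cleaned = cleaner.clean_html(lhtml.fromstring(sample))
    # <nav>, <script> and <footer> are killed; the paragraph survives
    print(lhtml.tostring(cleaned, encoding='unicode'))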
def display_raw(html):
    # strip scripts and styles before rendering untrusted markup
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    html = cleaner.clean_html(html)
    return display_html(html, raw=True)
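# --- Aside: typical use of display_raw inside a Jupyter/IPython session,
# assuming display_html above is IPython.display.display_html (its import is
# not shown in this excerpt):
#
#   display_raw('<div><script>alert(1)</script><b>safe text</b></div>')
#   # renders "<b>safe text</b>" with the script stripped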
class HtmlIngestor(DocumentIngestor):
    MIME_TYPES = ['text/html']
    EXTENSIONS = ['html', 'htm', 'asp', 'aspx', 'jsp']

    cleaner = Cleaner(scripts=True, javascript=True, style=True, links=True,
                      embedded=True, forms=True, frames=True,
                      annoying_tags=True, meta=False, remove_tags=['a'])

    def generate_pdf_version(self, html_path):
        """OK, this is weirder. Converting HTML to PDF via WebKit."""
        fh, out_path = mkstemp(suffix='.pdf')
        os.close(fh)
        wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
        args = [wkhtmltopdf, '--disable-javascript', '--no-outline',
                '--no-images', '--quiet', html_path, out_path]
        subprocess.call(args)
        return out_path

    def ingest(self, meta, local_path):
        fh, out_path = mkstemp(suffix='.htm')
        os.close(fh)
        with open(local_path, 'rb') as fh:
            data = fh.read()
        doc = html.fromstring(data)

        if not meta.has('title'):
            title = doc.findtext('.//title')
            if title is not None:
                meta.title = title.strip()

        if not meta.has('summary'):
            summary = doc.find('.//meta[@name="description"]')
            if summary is not None and summary.get('content'):
                meta.summary = summary.get('content')

        for field in ['keywords', 'news_keywords']:
            value = doc.find('.//meta[@name="%s"]' % field)
            if value is not None:
                value = value.get('content') or ''
                for keyword in value.split(','):
                    meta.add_keyword(keyword.strip())

        self.cleaner(doc)
        try:
            with open(out_path, 'w') as fh:
                fh.write(etree.tostring(doc))
            pdf_path = self.generate_pdf_version(out_path)
            if pdf_path is None or not os.path.isfile(pdf_path):
                raise IngestorException("Could not convert document: %r" % meta)
            self.extract_pdf_alternative(meta, pdf_path)
        finally:
            if os.path.isfile(out_path):
                os.unlink(out_path)
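# --- Aside: a standalone sketch of the same wkhtmltopdf invocation used by
# generate_pdf_version, assuming the wkhtmltopdf binary is on PATH (the class
# resolves its location from config instead). Illustrative only.
import os
import subprocess
from tempfile import mkstemp

def html_to_pdf(html_path):
    fd, out_path = mkstemp(suffix='.pdf')
    os.close(fd)  # wkhtmltopdf writes the file itself
    subprocess.call(['wkhtmltopdf', '--disable-javascript', '--no-outline',
                     '--no-images', '--quiet', html_path, out_path])
    return out_path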