Example 1
import math
import re
import pickle
import os.path
from pprint import pprint
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import as_completed
from lxml.html.clean import Cleaner
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from collections import Counter

not_word_chars = re.compile(r'\s+|[^\w]|\s+[^\w]*|[^A-Za-z]')
stemmer = SnowballStemmer('english')
cleaner = Cleaner(scripts=True,
                  javascript=True,
                  comments=True,
                  style=True,
                  embedded=True,
                  forms=True,
                  annoying_tags=True)
num_docs = 0


class Document(object):
    def __init__(self, path, tokens, counter):
        self.path = path
        self.tokens = tokens
        self.counter = counter

    @property
    def doc_length(self):
        return math.sqrt(
Example 2
from nxs_utils import ThreadPool, Timer
from subprocess import call
import subprocess
import sys
import seleniumclient
import xml.etree.ElementTree as ET
import re, lxml
from lxml.html.clean import Cleaner

WORKERS = 1

siteBase = "https://bed-search.nextprot.org/"
sitemapUrl = siteBase + "sitemap.xml"
#Where to save static site
dirlocation = "/work/tmp/static-site/"

cleaner = Cleaner()
cleaner.scripts = True  # This is True because we want to activate the javascript filter


def saveToFile(content, filename):
    text_file = open(filename, "w")
    text_file.write(content.encode('UTF-8'))
    text_file.close()
    print str(incrementCounter()) + " creating file " + filename + " "
    sys.stdout.flush()


def createDirectoryStructureIfNeeded(URLS):
    for url in URLS:
        filename = getFilename(url)
Example 3
def scrape_links(links):
    maincleaner = Cleaner(allow_tags=['div'],
                          remove_unknown_tags=False,
                          remove_tags=['div'])  # cleaner that strips every tag but keeps its text

    #    while True:
    for link in links:  # Loop through all the links
        if link == last_link:  # Check if this link has already been scraped (this will eventually be changed to check dates)
            break  # If we've hit something we've already scraped, break out of the loop
#        try:
        linkhtml = scraperwiki.scrape(link).decode(
            'Windows-1252'
        )  # scrape the contents of the current link and decode from Windows-1252 encoding
        print link
        root = lxml.html.fromstring(
            linkhtml)  # turn scraped content into an HTML object

        # GET TITLE
        title = root.cssselect("h1")[0].text.encode(
            'utf-8'
        )  # grab the page header (title) and encode its text as UTF-8
        title = replace_all(
            title, subDic)  # replace alphanumeric obfuscations with letters

        # GET DATE
        date = root.cssselect(
            "div.adInfo"
        )[0].text  # get the text of the html entity that contains the date and time of the post
        cleandate = re.sub(r'(\S+\s+\d+,\s+\d\d\d\d)(?:,?) (\d+\:\d+ \w\w)',
                           r'\1 \2',
                           date.strip())  # get date into a standard format
        cleandate = re.search(r'\S+\s+\d+, \d\d\d\d \d+\:\d+ \w\w',
                              cleandate).group(
                                  0)  # find the date string on the page
        rawdate = datetime.strptime(
            cleandate, '%B %d, %Y %I:%M %p'
        )  # parse the date string using format "Month dd, YYYY hh:mm am/pm"
        date = rawdate.strftime(
            '%Y-%m-%d %H:%M'
        )  # format the parsed date back into a "YYYY-mm-dd HH:MM" string

        # GET MAIN BODY TEXT
        mainwithtags = root.cssselect("div.postingBody")[
            0]  # grabs the body text of the post
        main = maincleaner.clean_html(mainwithtags).text.encode(
            'utf-8')  # gets rid of all HTML tags
        main = replace_all(
            main, subDic)  # replace alphanumeric obfuscations with letters

        # GET PHONE NUMBER(S)
        stripped = replace_all(
            main.lower(), wordDic
        )  # replaces common phone number obfuscations with actual numbers
        phonecomp = re.compile(r"[\s\-/=\.,{}_\!\@\#\$\%\^\&\*\(\)\~]"
                               )  # list of known phone number dividers
        stripped = phonecomp.sub('', stripped)  # remove phone number dividers
        phone = re.findall(
            r'(?:1?)[1-9]\d{9}', stripped
        )  # search for groups of 10 consecutive numbers (with an optional preceding 1)
        phone = list(
            set(phone)
        )  # gets rid of duplicate numbers by turning list into a set and back
        phone = ", ".join(
            phone)  # formats phone numbers as "phone1, phone2,... phoneN"

        # GET LISTED AGE
        if root.cssselect(
                "p.metaInfoDisplay"):  # does the entry have metainfo?
            listedage = root.cssselect("p.metaInfoDisplay")[
                0]  # get the first html metainfo element
            listedage = re.sub(
                r"[^\d]", "", listedage.text
            )  # get rid of all non-numeric text in the text of the element
        else:  # if there's no metainfo
            listedage = ""  # set the listed age to an empty string

        # GET LOCATION
        if re.findall(r'Location\:(.*?)\</div\>', linkhtml,
                      flags=re.DOTALL):  #
            location = re.findall(r'Location\:(.*?)\</div\>',
                                  linkhtml,
                                  flags=re.DOTALL)[0].encode('utf-8')
            #            location = removeNonAscii(location)
            print repr(location)
        else:
            location = ""

        picturelist = []
        pictures = root.cssselect('ul#viewAdPhotoLayout img')
        for i in range(len(pictures)):
            largepic = re.sub('/medium/', '/large/', pictures[i].get('src'))
            picturelist.append(largepic)
        print picturelist
        picturelist = " ".join(picturelist)
        x = urllib.urlopen(largepic).read()
        piccode = base64.encodestring(x)
        print piccode

        #        except:
        #            print 'FAILED TO LOAD: ' + link
        #        continue
        #            record = {}
        #            record['Title'] = 'LOAD FAILURE'
        # Set up our data record - we'll need it later

        record = {}
        record['Title'] = title  #.encode('ascii', 'ignore').strip()
        record['Date'] = date
        record['Main'] = main  #.encode('ascii', 'ignore').strip()
        record['Pictures'] = picturelist
        record['Phone'] = phone
        record['Listed Age'] = listedage  #.encode('ascii', 'ignore').strip()
        record['Location'] = location
        record['PicCode'] = piccode  #.encode('ascii', 'ignore').strip()
        # Print out the data we've gathered
        #print record, '------------'
        # Finally, save the record to the datastore - 'Artist' is our unique key
        scraperwiki.sqlite.save(["Title"], record)
Example 4
    body.rewrite_links(myRewriteLink) 
    f = open(fileout,"wb")
    f.write(html.tostring(body))
    f.close()
    


if len(sys.argv) != 3:
    usage()

dirin = sys.argv[1]
dirout= sys.argv[2]


cleaner = Cleaner(style=True)

import zipfile

with zipfile.ZipFile(dirin,'r') as myzip:
    for orig in myzip.namelist():
        print(orig)
        dest = os.path.join(dirout,orig)
        if orig.endswith('/'):
            os.makedirs(dest,exist_ok=True) 
        elif orig.endswith(".html"):
            f = myzip.read(orig)
            convert(f,dest)
        else:
            myzip.extract(orig,dirout)
Example 5
    using_sysrandom = True
except NotImplementedError:

    using_sysrandom = False

SECRET = random.randint(0, 1000000)
logger = logging.getLogger('castle.cms')
ANONYMOUS_USER = "******"

_truncate_cleaner = Cleaner(scripts=True,
                            javascript=True,
                            comments=True,
                            style=True,
                            links=True,
                            meta=True,
                            page_structure=True,
                            embedded=True,
                            frames=True,
                            forms=True,
                            annoying_tags=True,
                            remove_tags=('div', ),
                            kill_tags=('img', 'hr'),
                            remove_unknown_tags=True)


def truncate_text(text, max_words=30, more_link=None, clean=False):
    """
    adapted from Django
    """

    if not isinstance(text, basestring):
        return ''
Example 6
import lxml.etree
from lxml.html.clean import Cleaner
from w3lib.html import strip_html5_whitespace
import html_text

from extruct.utils import parse_html

# Cleaner which is similar to html_text cleaner, but is less aggressive
cleaner = Cleaner(
    scripts=True,
    javascript=False,  # onclick attributes are fine
    comments=True,
    style=True,
    links=True,
    meta=True,
    page_structure=False,  # <title> may be nice to have
    processing_instructions=True,
    embedded=False,  # keep embedded content
    frames=False,  # keep frames
    forms=False,  # keep forms
    annoying_tags=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
)


class LxmlMicrodataExtractor(object):
    _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
    _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
                                                  .//*[@itemscope]//*[@itemprop])""",
                                namespaces={"set": "http://exslt.org/sets"})
    _xp_clean_text = lxml.etree.XPath(
Example 7
    get_class_weight,
    get_link_density,
    is_unlikely_node,
    score_candidates,
)
from .utils import cached_property, shrink_text

html_cleaner = Cleaner(scripts=True,
                       javascript=True,
                       comments=True,
                       style=True,
                       links=True,
                       meta=False,
                       add_nofollow=False,
                       page_structure=False,
                       processing_instructions=True,
                       embedded=False,
                       frames=False,
                       forms=False,
                       annoying_tags=False,
                       remove_tags=None,
                       kill_tags=("noscript", "iframe"),
                       remove_unknown_tags=False,
                       safe_attrs_only=False)

SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
ANNOTATION_TAGS = (
    "a",
    "abbr",
    "acronym",
    "b",
Example 8
        strings = [strings]
    strings = [_collect_string_content(x) for x in strings]
    res = []
    for s in strings:
        try:
            res.append(s.replace(old, new))
        except:
            pass
    return res


_replace_line = re.compile("(<br>)|(</p>)|(</li>|</div>)", re.I | re.S)
_replace_blank = re.compile("""(?:<.*?>)|\r|\t""", re.I | re.S)
_remove_multi_line = re.compile('\n+', re.I | re.S)
_special_char = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
_html_cleaner = Cleaner(style=True)


def _unescape(s):
    if '&' not in s:
        return s

    def replaceEntities(s):
        s = s.groups()[0]
        special_char_dict = {
            "amp": "&",
            "lt": "<",
            "gt": ">",
            "nbsp": " ",
        }
        try:
Example 9
def html_clean(html_str):
    """ Clean up HTML to be safe """
    cleaner = Cleaner(safe_attrs_only=True)
    return cleaner.clean_html(html_str)
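
A minimal usage sketch, assuming only that lxml is installed (the sample markup is illustrative, not from the original project):

from lxml.html.clean import Cleaner

# safe_attrs_only=True keeps just the attributes on lxml's safe list
# (href, title, src, ...); event handlers such as onclick are dropped
print(html_clean('<a href="/ok" onclick="evil()">link</a>'))
# expect roughly: <a href="/ok">link</a>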
Example 10
SearchEngine = namedtuple('Engine', ['name', 'api', 'charset'])

search_engine = [
    SearchEngine(name='BAIDU', api='http://www.baidu.com/s?wd={}', charset='utf-8'),
    # SearchEngine(name='SOGOU', api='http://www.sogou.com/web?query={}', charset='utf-8'),
    SearchEngine(name='BDZD', api='https://zhidao.baidu.com/search?word={}', charset='gbk'),
    # SearchEngine(name='BING', api='http://cn.bing.com/search?q={}', charset='utf-8'),
    # SearchEngine(name='GOOGLE', api='https://www.google.com.hk/search?newwindow=1&hl=zh-CN&q={}', charset='utf-8'),
]

category_list = ['summary', 'option']
# category_list = ['summary']

old_question = []

cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)

headers = {
    'Host': 'msg.api.chongdingdahui.com',
    'User-Agent': 'LiveTrivia/1.0.4 (com.chongdingdahui.app; build:0.1.7; iOS 11.2.2) Alamofire/4.6.0',
    'X-Live-App-Version': '1.0.4',
    'Content-Type': 'application/json',
    'X-Live-Device-Identifier': 'AC654DF3-402D-40B3-BF20-19D8A5B57793',
    # 'X-Live-Session-Token': '1.3071218.845633.ZtC.d650c387f4c187ce54b2ea432bfa4f51',
    'X-Live-Session-Token': '1.3071218.2811521.lao.5dc60efb955f70eb1e981ceb422e0eae',
    'X-Live-Device-Type': 'ios',
    'X-Live-OS-Version': 'Version 11.2.2 (Build 15C202)',
}

search_header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
Example 11
def clean_html(text):
    from lxml.html.clean import Cleaner
    return Cleaner(links=False, style=True).clean_html(text)
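
A hedged usage sketch (sample markup assumed): style=True strips <style> blocks and style attributes, while links=False leaves <link> elements alone.

dirty = '<div><style>p {color: red}</style><p style="color: red">text</p></div>'
print(clean_html(dirty))
# expect roughly: <div><p>text</p></div>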
Example 12
import lxml.html
from lxml.html.clean import Cleaner

# Put the filename of the downloaded XHTML file here.
# Incidentally, 789_14547.html is "I Am a Cat" (Wagahai wa Neko de Aru).
FILE_NAME = 'data/xhtml/789_14547.html'

with open(FILE_NAME, encoding='shift_jis') as f:
    data = f.read().encode('shift_jis')

cleaner = Cleaner(page_structure=False, remove_tags=(
    'ruby', 'br'), kill_tags=('rt', 'rp'))
cln_html = cleaner.clean_html(data).decode('utf-8')

plain_text = lxml.html.fromstring(cln_html).find_class('main_text')[
    0].text_content()
# print(plain_text)

# Save to a separate file
PLAIN_TEXT = FILE_NAME.replace('xhtml', 'text').replace('.html', '.txt')
print(PLAIN_TEXT)
with open(PLAIN_TEXT, 'w') as f:
    f.write(plain_text)
Example 13
def cmd_clean(root, **kwargs):
    return Cleaner(**kwargs).clean_html(root)
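
A hedged usage sketch for cmd_clean (the kwargs are passed straight through to Cleaner; the sample markup is an assumption):

import lxml.html

root = lxml.html.fromstring('<div><style>p {}</style><p>text</p></div>')
print(lxml.html.tostring(cmd_clean(root, style=True), encoding='unicode'))
# expect roughly: <div><p>text</p></div>
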
# -----------------------------------------------------------------------------
# 1. Parse the raw HTML to get the interesting bits - the part inside <td> tags.
# -- UNCOMMENT THE 6 LINES BELOW (i.e. delete the # at the start of the lines)
# -- CLICK THE 'RUN' BUTTON BELOW
# Check the 'Console' tab again, and you'll see how we're extracting
# the HTML that was inside <td></td> tags.
# We use lxml, which is a Python library especially for parsing html.
# -----------------------------------------------------------------------------

html = html.replace('<br>', ' ')
html = re.sub(r'(\&.*?;)|(\n|\t|\r)', ' ', html)
print html
issues = []
root = lxml.html.fromstring(html)  # turn our HTML into an lxml object
cleaner = Cleaner(remove_tags=['font', 'span'],
                  links=False,
                  remove_unknown_tags=False)
root = cleaner.clean_html(root)
newhtml = lxml.html.tostring(root)

record = {}
datestring = re.findall("Updated (.*?)</p>", newhtml)[0]
date = time.strptime(
    datestring,
    '%b %d, %Y')  # encode the date as a date using format Month dd, YYYY
date = time.strftime(
    '%Y-%m-%d',
    date)  # decode that date back into a string of format YYYY-mm-dd

if scraperwiki.sqlite.get_var(
        'last_update'
Example 15
from lxml import etree
from lxml.html.clean import Cleaner
import sys

doc = open(sys.argv[1]).read()
cleaner = Cleaner(page_structure=False)

doc = cleaner.clean_html(doc)
tree = etree.HTML(doc)

# gives the heading
# heading = tree.xpath('//*[@id="firstClickFreeAllowed"]/div[1]/div/div[1]/div/div/div/header/h2')
# print etree.tostring(heading[0],pretty_print=True)

# gives the definition
definations = tree.xpath('//*[@id="firstClickFreeAllowed"]/div[1]/div')
totalmeanings = len(definations)
i = 1
print '''
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" class="no-js">
<body class=" ad_trick">
'''
for defi in definations:

    print etree.tostring(
        tree.xpath('//*[@id="firstClickFreeAllowed"]/div[1]/div[' + str(i) +
                   ']/div[1]')[0],
        pretty_print=True)

    i = i + 1
class feed_reader:
    """parse a list of feeds and return details as dictionary data"""
    # create the html cleaner; this cleans unwanted html tags out of the description text
    # page_structure=True, remove_unknown_tags=True
    html_cleaner = Cleaner()
    html_cleaner.javascript = True
    html_cleaner.style = True
    html_cleaner.remove_tags = [
        'script', 'iframe', 'link', 'style', 'img', 'div'
    ]
    #~ html_cleaner.allow_tags = ['a', 'p', 'strong']

    filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
        days=int(1.5 * 365))  #  1 and a half years ago

    html_img_cleaner = Cleaner(allow_tags=['img'], remove_unknown_tags=False)
    html_img_cleaner.allow_tags = ['img']

    html_parser = lxml.etree.HTMLParser()
    xml_parser = lxml.etree.XMLParser(remove_blank_text=True,
                                      ns_clean=True,
                                      encoding='utf-8')

    enable_date_filter = True

    def __init__(self, feed_details, timeout=5):
        self.results = {}
        for feed_info in feed_details:
            self.url = feed_info.get('url')
            self.author = feed_info.get('author')
            self.tags = feed_info.get('tags')
            if feed_info.get('url').startswith('http:'):
                try:
                    response = requests.get(feed_info.get('url'),
                                            stream=True,
                                            timeout=timeout)
                except requests.exceptions.Timeout as e:
                    continue
                if response.headers.get('content-encoding') == 'gzip':
                    response.raw.read = functools.partial(response.raw.read,
                                                          decode_content=True)
                try:
                    self.feed = lxml.etree.parse(response.raw, self.xml_parser)
                except:
                    continue
            else:
                with open(os.path.abspath(feed_info.get('url')),
                          'r') as file_stream:
                    try:
                        self.feed = lxml.etree.parse(file_stream,
                                                     self.xml_parser)
                    except:
                        continue

            self.feed = self.feed.getroot()

            # rss feed defaults
            self.channel_image = self.fetch_node_text(self.feed,
                                                      'channel/image/url', '')

            self.parse_feed()

    def convert_rfc822_to_datetime(self, rfcdate):
        """rss uses rfc822 dates so lets convert them to datetime for use later"""
        if len(rfcdate):
            parsed_rfcdate = parsedate_tz(rfcdate)
            if not parsed_rfcdate:
                return None
            return datetime.datetime.fromtimestamp(
                mktime_tz(parsed_rfcdate), pytz.utc).replace(tzinfo=None)
        return None

    def clean_up_text(self, text):
        """strip out any dirty tags like <script> they may break the sites"""
        if text is None:
            return ''
        cleaned_html = self.html_cleaner.clean_html(text)

        # parse large text separately
        if len(text) > 600:
            description = lxml.etree.parse(StringIO.StringIO(cleaned_html),
                                           self.html_parser)
            root = description.getroot()
            build = ''
            for node in root[-1][-1].iter():
                #skip any nodes with no text
                if node.text is None and node.tail is None:
                    continue
                # we may want to do some other node checks here
                # perhaps count paragraphs, html layout changes a lot
                if node.tag == 'br':
                    return build
                else:
                    if node.tag == 'a' and node.text is None:
                        build += node.tail
                    else:
                        build += lxml.etree.tostring(node)

        return self.html_cleaner.clean_html(text)

    def fetch_image_from_node_text(self, text):
        description = lxml.etree.parse(StringIO.StringIO(text),
                                       self.html_parser)
        for image in description.xpath('.//img'):
            return image.get('src')
        return None

    def fetch_image(self, node):
        """Try and get an image from an item in the feed, use various fall back methods"""
        image = node.xpath('media:thumbnail', namespaces=namespaces)
        if image:
            return image[0].get('url', '')

        # no media:thumbnail so lets try and grab an image from content:encoded
        image = node.xpath('content:encoded', namespaces=namespaces)
        if image:
            image = self.fetch_image_from_node_text(image[0].text)
            if image:
                return image

        # final attempt at getting an image from the item using description
        result = self.fetch_node_text(node, 'description')
        if result:
            image = self.fetch_image_from_node_text(result)
            if image:
                return image

        # no image so lets fall back to the channel image if it exists
        return self.channel_image

    def fetch_node_text(self, node, name, default=''):
        """fetch the text from the node we are given, we are working in unicode
        so decode byte strings to unicode"""
        result = node.xpath('./%s' % name)
        if result is None or len(result) == 0:
            return default

        if type(result[-1].text) is str:
            return result[-1].text.decode('utf8')
        else:
            return result[-1].text

    def fetch_node_attribute(self, node, name, attribs, default):
        result = node.xpath('./%s' % name)
        if result:
            return result.get(attribs, '')
        else:
            return default

    def format_author(self, author):
        """extract the authors name from the author text node"""
        return author.split('(')[-1].strip(')')

    def filter_by_tags(self, node, tags=None):
        """filter the feed out by category tag, if no tags assume its pre filtered"""
        if self.tags is None:
            return True
        for category in node.xpath('./category', namespaces=namespaces):
            if category.text.lower() in self.tags:
                return True
        return False

    def filter_by_date(self, date):
        """filter the feed out by date"""
        if self.enable_date_filter is False:
            return True
        if date > self.filter_by_date_expire:
            return True
        return False

    def parse_feed(self):
        """Parse the items in the feed, filter out bad data and put in defaults"""
        for item in self.feed.xpath('.//item', namespaces=namespaces):
            date = self.convert_rfc822_to_datetime(
                self.fetch_node_text(item, 'pubDate'))
            if self.filter_by_date(date) and self.filter_by_tags(item):
                author = self.format_author(
                    self.fetch_node_text(item, 'author', self.author))
                self.results.setdefault(author, []).append({
                    #~ self.results.append({
                    'title':
                    self.fetch_node_text(item, 'title'),
                    'date':
                    date,
                    'url':
                    self.fetch_node_text(item, 'link'),
                    'author':
                    author,
                    'image':
                    self.fetch_image(item),
                    'description':
                    self.clean_up_text(
                        self.fetch_node_text(item, 'description'))
                })

        #order authors articles by date
        for author in self.results.keys():
            self.results[author] = sorted(self.results[author],
                                          key=itemgetter('date'),
                                          reverse=True)

    def alternate_dict_and_sort_by_list_item_key(self,
                                                 dict_of_lists,
                                                 sort_key='date'):
        """ take a dictonary of ordered lists, step through each row and sort the current
        item position in each list and yield the result.
        
        basically gives the ordering of date while stepping through the blog entries to make it fair
        for people who do not blog often. """

        longest_list_length = max(
            [len(dict_of_lists[d]) for d in dict_of_lists.keys()])
        for i in xrange(0, longest_list_length):
            #get first value from each key, and order the list
            results = sorted([d.pop() for d in dict_of_lists.values() if d],
                             key=itemgetter(sort_key),
                             reverse=True)
            for item in results:
                yield item

    def __iter__(self):
        """return results ordered by date"""
        for author in self.alternate_dict_and_sort_by_list_item_key(
                self.results):
            yield author
Example 17
from app.chat.forms import LoginForm, RoomAddForm, ChangeNicknameForm
from app.chat.models import User, ROLE_USER, Room
from config import DATABASE_QUERY_TIMEOUT, OPENID_PROVIDERS
from flask import g, render_template, flash, url_for, request, session, redirect
from flask.ext.login import current_user, logout_user, login_user, login_required
from flask.ext.socketio import join_room, emit, leave_room
from flask.ext.sqlalchemy import get_debug_queries
from lxml.html import fromstring, iterlinks, make_links_absolute
from lxml.html.clean import Cleaner, autolink_html
from markupsafe import Markup


cleaner = Cleaner(
    style=True,
    links=True,
    add_nofollow=True,
    page_structure=True,
    safe_attrs_only=False,
    remove_tags=['p']
)


@lm.user_loader
def load_user(id):
    return User.query.get(int(id))


@chat.before_request
def before_request():
    g.user = current_user
    if g.user.is_authenticated():
        g.user.last_seen = datetime.utcnow()
Example 18
normHtmlFile = lzma.open(options.outDir + "/" + options.prefix + "normalized_html.xz", "w")
plainTextFile = lzma.open(options.outDir + "/" + options.prefix + "plain_text.xz", "w")
# Boilerpipe cleaning is optional
if options.boilerpipe:
    deboilFile = lzma.open(options.outDir + "/" + options.prefix + "deboilerplate_html.xz", "w")

for record in f:
    # We convert into UTF8 first of all
    orig_encoding, text = convert_encoding(record.payload.read())
    url = record.url
    if orig_encoding is None:
        logging.info("Encoding of document " + url + " could not be identified")

    if len(text) > 0:
        # HTML is then normalized
        cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)

        tree=""
        try:
            cleanhtml = cleaner.clean_html(re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE))
            tree = ftfy.fix_text(cleanhtml, fix_entities=False, fix_character_width=False)
            #document = html5lib.parse(fixedtext, treebuilder="lxml", namespaceHTMLElements=False)
            #tree = etree.tostring(document, encoding="utf-8")
        except Exception as ex:
            sys.stderr.write(str(ex)+"\n")
            continue
        cleantree = tree.replace("&#160;", " ")
        cleantree = cleantree.replace("\t", " ")

        # lang id
        #printable_str = ''.join(x for x in cleantree if x in string.printable)
Example 19
def cleanup(data, tags):
    cleaner = Cleaner(remove_tags=tags)
    clean = cleaner.clean_html(data)
    root = lxml.html.fromstring(clean)
    return root
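
A minimal usage sketch (the imports are assumptions; the excerpt itself relies on lxml.html and Cleaner being imported elsewhere):

import lxml.html
from lxml.html.clean import Cleaner

# remove_tags drops the tags themselves but keeps their text content
root = cleanup('<div><b>bold</b> and <i>italic</i> text</div>', ['b', 'i'])
print(lxml.html.tostring(root, encoding='unicode'))
# expect roughly: <div>bold and italic text</div>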
Example 20
  def get_message_tree(self):
    tree = {
      'id': self.get_msg_info(self.index.MSG_ID),
      'tags': self.get_msg_info(self.index.MSG_TAGS).split(','),
      'summary': self.get_msg_summary(),
      'headers': {},
      'headers_lc': {},
      'attributes': {},
      'text_parts': [],
      'html_parts': [],
      'attachments': [],
      'conversation': [],
    }

    conv_id = self.get_msg_info(self.index.MSG_CONV_ID)
    if conv_id:
      conv = Email(self.index, int(conv_id, 36))
      tree['conversation'] = convs = [conv.get_msg_summary()]
      for rid in conv.get_msg_info(self.index.MSG_REPLIES).split(','):
        if rid:
          convs.append(Email(self.index, int(rid, 36)).get_msg_summary())

    # FIXME: Decide if this is strict enough or too strict...?
    html_cleaner = Cleaner(page_structure=True, meta=True, links=True,
                           javascript=True, scripts=True, frames=True,
                           embedded=True, safe_attrs_only=True)

    msg = self.get_msg()
    for hdr in msg.keys():
      tree['headers'][hdr] = self.index.hdr(msg, hdr)
      tree['headers_lc'][hdr.lower()] = self.index.hdr(msg, hdr)

    # Note: count algorithm must match that used in extract_attachment above
    count = 0
    for part in msg.walk():
      mimetype = part.get_content_type()
      if mimetype.startswith('multipart/'):
        continue

      count += 1
      if (part.get('content-disposition', 'inline') == 'inline'
      and mimetype in ('text/plain', 'text/html')):
        payload, charset, openpgp = self.decode_payload(part)
        # FIXME: Do something with the openpgp data!
        if (mimetype == 'text/html' or
            '<html>' in payload or
            '</body>' in payload):
          tree['html_parts'].append({
            'openpgp_status': openpgp and openpgp[0] or '',
            'openpgp_data': openpgp and openpgp[1] or '',
            'charset': charset,
            'type': 'html',
            'data': (payload.strip() and html_cleaner.clean_html(payload)) or ''
          })
        else:
          tree['text_parts'].extend(self.parse_text_part(payload, charset,
                                                         openpgp))
      else:
        tree['attachments'].append({
          'mimetype': mimetype,
          'count': count,
          'part': part,
          'length': len(part.get_payload(None, True) or ''),
          'content-id': part.get('content-id', ''),
          'filename': part.get_filename() or ''
        })

    if self.is_editable():
      tree['is_editable'] = True
      tree['editing_string'] = self.get_editing_string(tree)

    return tree
Example 21
    def __init__(self, html):
        cleaner = Cleaner(style=True, page_structure=False,
                          remove_tags=('br',), safe_attrs_only=False)
        self.html = cleaner.clean_html(html)
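
An illustrative standalone run of the same Cleaner configuration (sample markup assumed):

from lxml.html.clean import Cleaner

cleaner = Cleaner(style=True, page_structure=False,
                  remove_tags=('br',), safe_attrs_only=False)
# <br> tags are dropped while their tail text is kept, and style is stripped;
# safe_attrs_only=False leaves non-standard attributes like data-x in place
print(cleaner.clean_html('<div data-x="1">one<br>two</div>'))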
Example 22
    def handle(self, *args, **options):
        individuals = Individual.objects.all()
        for y1, y2 in year_ranges:
            url = url_pattern % (y1, y2, y1, y2)
            r = requests.get(url)
            r.encoding = "utf-8"
            output = r.text
            root = etree.HTML(output)
            dates = [
                d.text for d in root.xpath(
                    "//h2[@class=\"h3_style\"]/a[contains(@href,\"agenda\")]")
            ]
            tables = root.xpath("//table[@class=\"interlaced\"]")
            if len(dates) != len(tables):
                raise Exception("Dates and Questions Mismatch! %d <> %d" %
                                (len(dates), len(tables)))

            for i in range(0, len(dates)):
                date = datetime.strptime(dates[i], '%d.%m.%Y')
                print date
                table = tables[i]
                for row in table.xpath(".//tr")[1:]:
                    cells = row.xpath("td")
                    if all_text(cells[3]).strip() == '-':
                        continue
                    legislator_name = cells[1].text
                    if legislator_name.startswith(u"郭偉强"):
                        legislator_name = u"郭偉強"
                    title = all_text(cells[2])
                    question_type_text = all_text(cells[0])
                    individual = None
                    for p in individuals:
                        if legislator_name.startswith(p.name_ch):
                            individual = p
                            break
                    if individual is None:
                        print(legislator_name)
                        raise Exception("Individual not found. ",
                                        legislator_name)
                    link = cells[3].xpath(".//a")[0].attrib['href']
                    key = str(md5.new(link).hexdigest())
                    m = re.match(r"(.*[0-9]+|UQ)[\(]{0,1}(.*)\)",
                                 question_type_text)
                    if m is None:
                        raise Exception("Undefined Question Type", link,
                                        question_type_text)
                    question_type = m.group(2)
                    detail_r = requests.get(link)
                    detail_r.encoding = "big5"
                    output = detail_r.text
                    cleaner = Cleaner(comments=False)
                    output = cleaner.clean_html(output)
                    detail_root = etree.HTML(output)
                    try:
                        press_release = all_text(
                            detail_root.xpath("//div[@id=\"pressrelease\"]")
                            [0])
                    except IndexError:
                        detail_r = requests.get(link)
                        detail_r.encoding = "utf-8"
                        output = detail_r.text
                        output = cleaner.clean_html(output)
                        detail_root = etree.HTML(output)
                        press_release = all_text(
                            detail_root.xpath("//span[@id=\"pressrelease\"]")
                            [0])
                    question_start = press_release.find(u'以下')
                    reply_start = press_release.rfind(u'答覆:')
                    question_text = press_release[question_start:reply_start]
                    answer_text = press_release[reply_start + 3:]
                    #print(question_text)
                    #print(answer_text)
                    #print link
                    #print date
                    #print individual.name_en
                    #print key
                    #print question_type
                    question = Question()
                    question.key = key
                    question.individual = individual
                    question.date = date
                    question.question_type = question_type
                    question.question = question_text
                    question.answer = answer_text
                    question.title = title
                    question.link = link
                    question.title_ch = title
                    try:
                        question.save()
                    except IntegrityError:
                        print("%s %s already exists" % (str(date), title))
Example 23
cleaner = Cleaner(
    allow_tags=(
        "a",
        "img",
        "h1",
        "h2",
        "h3",
        "strong",
        "em",
        "b",
        "i",
        "sub",
        "sup",
        "p",
        "br",
        "hr",
        "pre",
        "div",
        "ul",
        "ol",
        "li",
        "table",
        "thead",
        "tbody",
        "tr",
        "th",
        "td",
    ),
    remove_unknown_tags=False,
    safe_attrs=set(["class", "href", "src", "alt"]),
)
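
A hedged demonstration of the allowlist above (sample markup assumed): tags outside allow_tags are dropped with their text kept, and attributes outside safe_attrs are removed.

dirty = '<p class="intro" data-track="1">Hi <u>there</u><script>evil()</script></p>'
print(cleaner.clean_html(dirty))
# expect roughly: <p class="intro">Hi there</p>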
Example 24

def get_best_next_url(*urls):
    """Returns the safest URL to redirect to from a given list."""
    for url in urls:
        url = urljoin(settings.APP_UI_URL, url)
        if url and is_safe_url(url):
            return url
    return settings.APP_UI_URL


CLEANER = Cleaner(style=True,
                  meta=True,
                  links=False,
                  remove_tags=['body', 'form'],
                  kill_tags=[
                      'area', 'audio', 'base', 'bgsound', 'embed', 'frame',
                      'frameset', 'head', 'img', 'iframe', 'input', 'link',
                      'map', 'meta', 'nav', 'object', 'plaintext', 'track',
                      'video'
                  ])


def sanitize_html(html_text, base_url):
    """Remove anything from the given HTML that must not show up in the UI."""
    # TODO: circumvent encoding declarations?
    if html_text is None:
        return
    cleaned = CLEANER.clean_html(html_text)
    html = document_fromstring(cleaned)
    for (el, attr, href, _) in html.iterlinks():
        href = normalize_href(href, base_url)
Example 25
def serve(base_folder_path) -> Flask:
    app = Flask(__name__)

    app.jinja_env.trim_blocks = True
    app.jinja_env.lstrip_blocks = True

    recipe_parser = RecipeParser()
    recipe_serializer = RecipeSerializer()

    _cleaner = Cleaner(meta=True,
                       embedded=True,
                       links=True,
                       style=True,
                       processing_instructions=True,
                       scripts=True,
                       javascript=True,
                       frames=True,
                       remove_unknown_tags=True,
                       page_structure=True,
                       remove_tags=['body'])

    @app.context_processor
    def pjax_processor():
        def get_root_template():
            if "X-PJAX" in request.headers:
                return "pjax.html"
            return "structure.html"

        return dict(get_root_template=get_root_template)

    @app.template_filter()
    def markdown_to_cleaned_html(markdown):
        unsafe_html_str = commonmark.commonmark(markdown)
        # remove wrapping div
        # https://stackoverflow.com/questions/21420922/how-to-use-cleaner-lxml-html-without-returning-div-tag
        unsafe_doc = document_fromstring(unsafe_html_str)
        clean_doc = _cleaner.clean_html(unsafe_doc)
        clean_html_str = "\n".join(
            tostring(ch, encoding="unicode") for ch in clean_doc)
        return Markup(clean_html_str)

    @app.template_filter()
    def get_recipe_title(child_name: str, parent_path) -> str:
        absolute_path = os.path.join(base_folder_path, parent_path, child_name)
        if os.path.isdir(absolute_path):
            return Markup('<em>Folder</em>')
        try:
            with open(absolute_path, 'r', encoding='UTF-8') as f:
                recipe = recipe_parser.parse(f.read())
            # TODO markdown to html
            return recipe.title
        except RuntimeError:
            return Markup('<strong>Invalid recipe!</strong>')

    @app.template_filter()
    def serialize_ingredients(ingredients: List[Ingredient]):
        return ("\n".join(
            recipe_serializer._serialize_ingredient(i, rounding=2)
            for i in ingredients)).strip()

    @app.route('/')
    @app.route('/<path:relative_path>')
    def download_file(relative_path=''):
        absolute_path = os.path.join(base_folder_path, relative_path)

        if os.path.isdir(absolute_path):
            if not absolute_path.endswith('/'):
                return redirect(f'/{relative_path}/', code=302)

            child_paths = [(ch, os.path.isdir(os.path.join(absolute_path, ch)))
                           for ch in os.listdir(absolute_path)]
            child_paths = [
                (ch, is_dir) for ch, is_dir in child_paths
                if not ch.startswith('.') and (is_dir or ch.endswith('.md'))
            ]
            child_paths = [
                f'{ch}/' if not ch.endswith('/') and is_dir else ch
                for ch, is_dir in child_paths
            ]
            child_paths = sorted(child_paths)
            return render_template("folder.html",
                                   child_paths=child_paths,
                                   path=relative_path)

        if not absolute_path.endswith('.md'):
            return send_from_directory(base_folder_path, relative_path)

        with open(absolute_path, 'r', encoding='UTF-8') as f:
            required_yield_str = request.args.get('yield', '1')
            required_yield = recipe_parser.parse_amount(required_yield_str)
            if required_yield is None:
                required_yield = Amount(factor=Decimal(1))

            src = f.read()

            try:
                recipe = recipe_parser.parse(src)
            except Exception as e:
                return render_template("markdown.html",
                                       markdown=src,
                                       path=relative_path,
                                       errors=[e.args[0]])

            errors = []
            try:
                recipe = get_recipe_with_yield(recipe, required_yield)
            except StopIteration:
                errors.append(
                    f'The recipe does not specify a yield in the unit "{required_yield.unit}". '
                    f'The following units can be used: ' +
                    ", ".join(f'"{y.unit}"' for y in recipe.yields))
            except Exception as e:
                errors.append(str(e))

            return render_template(
                "recipe.html",
                recipe=recipe,
                yields=recipe_serializer._serialize_yields(recipe.yields,
                                                           rounding=2),
                tags=recipe_serializer._serialize_tags(recipe.tags),
                units=list(set(y.unit for y in recipe.yields)),
                default_yield=recipe_serializer._serialize_amount(
                    recipe.yields[0]) if recipe.yields else "1",
                path=relative_path,
                errors=errors)

    return app
Example 26
import logging
import re

from lxml import etree
from lxml.html.clean import Cleaner

from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim
from .xpaths import COMMENTS_DISCARD_XPATH, DISCARD_XPATH

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config # http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
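
A hedged sketch of applying this cleaner (with the flags above, only comments and processing instructions are removed; the rest of the markup passes through):

import lxml.html

tree = lxml.html.fromstring('<div><!-- boilerplate --><p>kept</p></div>')
print(lxml.html.tostring(HTML_CLEANER.clean_html(tree), encoding='unicode'))
# expect: <div><p>kept</p></div>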
Example 27
class HackathonManager(Component):
    """Component to manage hackathon

    Note that it only handle operations directly related to Hackathon table. Things like registerd users, templates are
    in separated components
    """

    admin_manager = RequiredFeature("admin_manager")
    user_manager = RequiredFeature("user_manager")
    register_manager = RequiredFeature("register_manager")

    # basic xss prevention
    cleaner = Cleaner(safe_attrs=lxml.html.defs.safe_attrs
                      | set(['style']))  # preserve style

    def is_hackathon_name_existed(self, name):
        """Check whether hackathon with specific name exists or not

        :type name: str|unicode
        :param name: name of hackathon

        :rtype: bool
        :return True if hackathon with specific name exists otherwise False
        """
        hackathon = self.get_hackathon_by_name(name)
        return hackathon is not None

    def is_recycle_enabled(self, hackathon):
        key = HACKATHON_CONFIG.RECYCLE_ENABLED
        return self.get_basic_property(hackathon, key, False)

    def get_hackathon_by_name(self, name):
        """Get hackathon accoring the unique name

        :type name: str|unicode
        :param name: name of hackathon

        :rtype: Hackathon
        :return hackathon instance if found else None
        """
        if not name:
            return None

        return Hackathon.objects(name=name).first()

    def get_hackathon_by_id(self, hackathon_id):
        """Query hackathon by id
        :type hackathon_id: str or ObjectId are both ok
        :param hackathon_id: _id of hackathon

        :return hackathon instance or None
        """
        return Hackathon.objects(id=hackathon_id).first()

    def get_hackathon_detail(self, hackathon):
        user = None
        if self.user_manager.validate_login():
            user = g.user

        return self.__get_hackathon_detail(hackathon, user)

    def get_hackathon_stat(self, hackathon):
        def internal_get_stat():
            return self.__get_hackathon_stat(hackathon)

        cache_key = "hackathon_stat_%s" % hackathon.id
        return self.cache.get_cache(key=cache_key,
                                    createfunc=internal_get_stat)

    # TODO: implement HackathonStat related features: order_by == 'registered_users_num':
    def get_hackathon_list(self, args):
        # get values from request's QueryString
        page = int(args.get("page", 1))
        per_page = int(args.get("per_page", 20))
        order_by = args.get("order_by", "create_time")
        status = args.get("status")
        name = args.get("name")

        # build query by search conditions and order_by
        status_filter = Q()
        name_filter = Q()
        condition_filter = Q()
        order_by_condition = '-id'

        if status:
            status_filter = Q(status=status)
        if name:
            name_filter = Q(name__contains=name)

        if order_by == 'create_time':  # most recently published
            order_by_condition = '-create_time'
        elif order_by == 'event_start_time':  # starting soon
            order_by_condition = '-event_start_time'
        elif order_by == 'registered_users_num':  # most popular
            # hackathons with zero registered users would not be shown.
            hot_hackathon_stat = HackathonStat.objects(
                type=HACKATHON_STAT.REGISTER, count__gt=0).order_by('-count')
            hot_hackathon_list = [
                stat.hackathon.id for stat in hot_hackathon_stat
            ]
            condition_filter = Q(id__in=hot_hackathon_list)
        else:
            order_by_condition = '-id'

        # perform db query with pagination
        pagination = Hackathon.objects(status_filter & name_filter
                                       & condition_filter).order_by(
                                           order_by_condition).paginate(
                                               page, per_page)

        hackathon_list = pagination.items
        hackathon_stat = HackathonStat.objects(hackathon__in=hackathon_list)

        user = None
        user_hackathon = []
        team = []
        if self.user_manager.validate_login():
            user = g.user
            user_hackathon = UserHackathon.objects(
                user=user, hackathon__in=hackathon_list)
            team = Team.objects(members__user=user,
                                hackathon__in=hackathon_list)

        def func(hackathon):
            return self.__fill_hackathon_detail(hackathon, user,
                                                hackathon_stat, user_hackathon,
                                                team)

        # return serializable items as well as total count
        return self.util.paginate(pagination, func)

    def get_online_hackathons(self):
        return Hackathon.objects(status=HACK_STATUS.ONLINE)

    def get_user_hackathon_list_with_detail(self, user_id):
        user_hackathon_rels = UserHackathon.objects(
            user=user_id, role=HACK_USER_TYPE.COMPETITOR).all()

        def get_user_hackathon_detail(user_hackathon_rel):
            dict = user_hackathon_rel.dic()
            dict["hackathon_info"] = user_hackathon_rel.hackathon.dic()
            return dict

        return [get_user_hackathon_detail(rel) for rel in user_hackathon_rels]

    def get_recyclable_hackathon_list(self):
        # TODO: filter hackathons by hackathon.config at the db level if possible
        hackathons = Hackathon.objects(
            status=HACK_STATUS.ONLINE,
            event_start_time__lt=self.util.get_now(),
            event_end_time__gt=self.util.get_now()).all()
        return [h for h in hackathons if self.is_recycle_enabled(h)]

    def get_basic_property(self, hackathon, key, default=None):
        """Get basic property of hackathon from HackathonConfig"""
        if hackathon.config:
            return hackathon.config.get(key, default)
        return default

    def get_all_properties(self, hackathon):
        config = hackathon.config
        return config if config else {}

    def set_basic_property(self, hackathon, properties):
        """Set basic property in table HackathonConfig"""

        hackathon.config.update(properties)
        hackathon.save()

        self.cache.invalidate(self.__get_config_cache_key(hackathon))
        return ok()

    def delete_basic_property(self, hackathon, keys):
        if isinstance(keys, str):
            keys = keys.split()

        list(map(lambda key: hackathon.config.pop(key, None), keys))

        hackathon.save()
        self.cache.invalidate(self.__get_config_cache_key(hackathon))
        return ok()

    def get_recycle_minutes(self, hackathon):
        key = HACKATHON_CONFIG.RECYCLE_MINUTES
        minutes = self.get_basic_property(hackathon, key, 60)
        return int(minutes)

    def validate_hackathon_name(self):
        if HTTP_HEADER.HACKATHON_NAME in request.headers:
            try:
                hackathon_name = request.headers[HTTP_HEADER.HACKATHON_NAME]
                hackathon = Hackathon.objects(name=hackathon_name).first()
                if hackathon:
                    g.hackathon = hackathon
                    return True
                else:
                    self.log.debug("cannot find hackathon by name %s" %
                                   hackathon_name)
                    return False
            except Exception as ex:
                self.log.error(ex)
                self.log.debug("hackathon_name invalid")
                return False
        else:
            self.log.debug("hackathon_name not found in headers")
            return False

    def create_new_hackathon(self, context):
        """Create new hackathon based on the http body

        Hackathon name is unique so duplicated names are not allowed.

        :type context: Context
        :param context: the body of http request that contains fields to create a new hackathon

        :rtype: dict
        """
        if Hackathon.objects(name=context.name).count() > 0:
            raise PreconditionFailed("hackathon name already exists")

        self.log.debug("add a new hackathon:" + context.name)
        new_hack = self.__create_hackathon(g.user, context)

        self.create_hackathon_notice(
            new_hack.id, HACK_NOTICE_EVENT.HACK_CREATE,
            HACK_NOTICE_CATEGORY.HACKATHON)  # hackathon create

        # init data is for local only
        if self.util.is_local():
            self.__create_default_data_for_local(new_hack)

        return new_hack.dic()

    def update_hackathon(self, args):
        """Update hackathon properties

        :type args: dict
        :param args: arguments from http request body that contains properties with new values

        :rtype dict
        :return hackathon in dict if updated successfully.
        """
        hackathon = g.hackathon

        try:
            update_items = self.__parse_update_items(args, hackathon)
            self.log.debug("update hackathon items :" + str(list(args.keys())))

            if 'config' in update_items:
                self.set_basic_property(hackathon,
                                        update_items.get('config', {}))
                update_items.pop('config', None)

            # basic xss prevention
            if 'description' in update_items and update_items['description']:
                #update_items['description'] = self.cleaner.clean_html(update_items['description'])
                self.log.debug("hackathon description :" +
                               update_items['description'])

            hackathon.modify(**update_items)
            hackathon.save()

            return ok()
        except Exception as e:
            self.log.error(e)
            return internal_server_error("fail to update hackathon")

    def delete_hackathon(self):
        """delete hackathon
        :return hackathon in dict if updated successfully.
        """
        hackathon = g.hackathon
        try:
            UserHackathon.objects(hackathon=hackathon).delete()
            self.log.debug("delete hackathon:" + hackathon.name)
            hackathon.delete()
            hackathon.save()
            return ok()
        except Exception as e:
            self.log.error(e)
            return internal_server_error("fail to delete hackathon" +
                                         hackathon.name)

    def apply_online_hackathon(self, hackathon):
        """apply for onlining a hackathon, should be called by the hackathon creator
        :return hackathon in dict if updated successfully.
        """
        try:
            req = ok()
            if hackathon.status == HACK_STATUS.OFFLINE or hackathon.status == HACK_STATUS.DRAFT:
                hackathon.status = HACK_STATUS.APPLY_ONLINE
                hackathon.save()
            elif hackathon.status == HACK_STATUS.INIT:
                req = general_error(code=HTTP_CODE.CREATE_NOT_FINISHED)
            return req
        except Exception as e:
            self.log.error(e)
            return internal_server_error("fail to delete hackathon" +
                                         hackathon.name)

    def get_userlike_all_hackathon(self, user_id):
        user_hackathon_rels = UserHackathon.objects(user=user_id).all()

        def get_user_hackathon_detail(user_hackathon_rel):
            dict = user_hackathon_rel.dic()
            dict["hackathon_info"] = user_hackathon_rel.hackathon.dic()
            return dict

        return [get_user_hackathon_detail(rel) for rel in user_hackathon_rels]

    def like_hackathon(self, user, hackathon):
        user_hackathon = UserHackathon.objects(hackathon=hackathon,
                                               user=user).first()
        if user_hackathon and user_hackathon.like:
            return ok()

        if not user_hackathon:
            user_hackathon = UserHackathon(hackathon=hackathon,
                                           user=user,
                                           role=HACK_USER_TYPE.VISITOR,
                                           status=HACK_USER_STATUS.UNAUDIT,
                                           like=True,
                                           remark="")
            user_hackathon.save()
        if not user_hackathon.like:
            user_hackathon.like = True
            user_hackathon.save()

        # increase the count of users that like this hackathon
        self.increase_hackathon_stat(hackathon, HACKATHON_STAT.LIKE, 1)

        return ok()

    def unlike_hackathon(self, user, hackathon):
        user_hackathon = UserHackathon.objects(user=user,
                                               hackathon=hackathon).first()
        if user_hackathon:
            user_hackathon.like = False
            user_hackathon.save()

        # sync the like count
        like_count = UserHackathon.objects(hackathon=hackathon,
                                           like=True).count()
        self.update_hackathon_stat(hackathon, HACKATHON_STAT.LIKE, like_count)
        return ok()

    def update_hackathon_stat(self, hackathon, stat_type, count):
        """Increase or descrease the count for certain hackathon stat

        :type hackathon: Hackathon
        :param hackathon: instance of Hackathon to be counted

        :type stat_type: str|unicode
        :param stat_type: type of stat that defined in constants.py#HACKATHON_STAT

        :type count: int
        :param count: the new count for this stat item
        """
        stat = HackathonStat.objects(hackathon=hackathon,
                                     type=stat_type).first()
        if stat:
            stat.count = count
            stat.update_time = self.util.get_now()
        else:
            stat = HackathonStat(hackathon=hackathon,
                                 type=stat_type,
                                 count=count)

        if stat.count < 0:
            stat.count = 0
        stat.save()

    def increase_hackathon_stat(self, hackathon, stat_type, increase):
        """Increase or descrease the count for certain hackathon stat

        :type hackathon: Hackathon
        :param hackathon: instance of Hackathon to be counted

        :type stat_type: str|unicode
        :param stat_type: type of stat that defined in constants.py#HACKATHON_STAT

        :type increase: int
        :param increase: increase of the count. Can be positive or negative
        """
        stat = HackathonStat.objects(hackathon=hackathon,
                                     type=stat_type).first()
        if stat:
            stat.count += increase
        else:
            stat = HackathonStat(hackathon=hackathon,
                                 type=stat_type,
                                 count=increase)

        if stat.count < 0:
            stat.count = 0
        stat.update_time = self.util.get_now()
        stat.save()
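    # A minimal sketch of how the two stat helpers above differ (values
    # illustrative, not taken from the original code):
    #
    #   self.update_hackathon_stat(hackathon, HACKATHON_STAT.LIKE, 10)
    #       -> LIKE count is set to exactly 10
    #   self.increase_hackathon_stat(hackathon, HACKATHON_STAT.LIKE, -3)
    #       -> LIKE count becomes 7; both helpers clamp a negative result to 0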

    def get_distinct_tags(self):
        """Return all distinct hackathon tags for auto-complete usage"""
        return self.db.session().query(HackathonTag.tag).distinct().all()

    def create_hackathon_organizer(self, hackathon, body):
        organizer = Organization(id=uuid.uuid4(),
                                 name=body.name,
                                 description=body.get("description", ""),
                                 organization_type=body.organization_type,
                                 homepage=body.get("homepage", ""),
                                 logo=body.get("logo", ""))

        hackathon.organizers.append(organizer)
        hackathon.update_time = self.util.get_now()
        hackathon.save()
        return hackathon.dic()

    def update_hackathon_organizer(self, hackathon, body):
        organizer = hackathon.organizers.get(id=body.id)
        if not organizer:
            return not_found()

        organizer.name = body.get("name", organizer.name)
        organizer.description = body.get("description", organizer.description)
        organizer.homepage = body.get("homepage", organizer.homepage)
        organizer.logo = body.get("logo", organizer.logo)
        organizer.organization_type = body.get("organization_type",
                                               organizer.organization_type)

        hackathon.update_time = self.util.get_now()
        hackathon.save()
        return hackathon.dic()

    def delete_hackathon_organizer(self, hackathon, organizer_id):
        if hackathon.organizers.filter(id=organizer_id):
            hackathon.update(pull__organizers=hackathon.organizers.get(
                id=organizer_id))

        hackathon.update_time = self.util.get_now()
        hackathon.save()
        return ok()

    def create_hackathon_award(self, hackathon, body):
        level = int(body.get("level"))
        if level > 10:
            level = 10

        award = Award(id=uuid.uuid4(),
                      name=body.get("name"),
                      sub_name=body.get("sub_name"),
                      description=body.get("description"),
                      level=level,
                      quota=body.get("quota"),
                      award_url=body.get("award_url"))
        hackathon.update(push__awards=award)

        hackathon.update_time = self.util.get_now()
        hackathon.save()
        return ok()

    def update_hackathon_award(self, hackathon, body):
        award = hackathon.awards.get(id=body.get("id"))
        if not award:
            return not_found("award not found")

        level = award.level
        if "level" in body:
            level = int(body.get("level"))
            if level > 10:
                level = 10

        award.name = body.get("name", award.name)
        award.sub_name = body.get("sub_name", award.sub_name)
        award.description = body.get("description", award.description)
        award.level = body.get("level", level)
        award.quota = body.get("quota", award.quota)
        award.award_url = body.get("award_url", award.award_url)
        award.save()

        hackathon.update_time = self.util.get_now()
        hackathon.save()
        return ok()

    def delete_hackathon_award(self, hackathon, award_id):
        award = hackathon.awards.get(id=award_id)
        hackathon.update(pull__awards=award)
        hackathon.update_time = self.util.get_now()
        hackathon.save()

        # delete granted award in teams
        award_uuid = uuid.UUID(award_id)
        Team.objects(hackathon=hackathon,
                     awards=award_uuid).update(pull__awards=award_uuid)

        return ok()

    def list_hackathon_awards(self, hackathon):
        awards = hackathon.dic()["awards"]
        awards.sort(key=lambda award: -award["level"])
        return awards

    def get_hackathon_notice(self, notice_id):
        hackathon_notice = HackathonNotice.objects(id=notice_id).first()
        if not hackathon_notice:
            return not_found("hackathon_notice not found")

        return hackathon_notice.dic()

    def create_hackathon_notice(self,
                                hackathon_id,
                                notice_event,
                                notice_category,
                                body=None):
        """
        create hackathon notice with hackathon_id, notice_event, notice_category.
        notice 'content' and 'link' can be included in body (optional)

        :type hackathon_id: int
        :param hackathon_id: id of hackathon that the notice belongs to (-1 if the notice doesn't belong to a specfic hackathon)

        :type notice_event: Class HACK_NOTICE_EVENT
        :param notice_event: event that the notice is triggered by, used for notice filtering (see get_hackathon_notice_list())
                             more specfic than notice_category, new events can be added without disturbing front-end code

        :type notice_category: Class HACK_NOTICE_CATEGORY
        :param notice_category: category that the notice belongs to, used for notice filtering and notice properties display
                                at front-end (e.g. icons/descriptions, see oh.manage.notice.js & oh.site.hackathon.js),
                                more general than notice_event, if you want to add a new category in HACK_NOTICE_CATEGORY,
                                remember to update front-end js code as well.

        :type body: dict/Context, default value: {}
        :param body: other necessary information, e.g.: 'content'(notice's content), 'link'(notice's link), other keys for specfic uses

        :return: hackathon_notice in dict

        ::Example:
        :create_hackathon_notice(2, HACK_NOTICE_EVENT.xx, HACK_NOTICE_CATEGORY.yy, {'content': 'zz'})
            a new notice for a hackathon with id 2 is created for the propose of HACK_NOTICE_EVENT.xx. The notice's front-end icon
            and description is determined by HACK_NOTICE_CATEGORY.yy, while its content is 'zz' and its link url is ''

        :create_hackathon_notice(-1, HACK_NOTICE_EVENT.xx, HACK_NOTICE_CATEGORY.yy)
            a new notice not belongs to any hackathon is created for the propose of HACK_NOTICE_EVENT.xx. The notice's front-end icon
            and description is determined by HACK_NOTICE_CATEGORY.yy, while its content and link url is ''
        """
        body = body or {}  # avoid sharing a mutable default argument across calls
        hackathon_notice = HackathonNotice(content='',
                                           link='',
                                           event=notice_event,
                                           category=notice_category)

        hackathon = self.get_hackathon_by_id(hackathon_id)
        if hackathon:
            hackathon_notice.hackathon = hackathon

        # notice creation logic for different notice_events
        if hackathon:
            if notice_event == HACK_NOTICE_EVENT.HACK_CREATE:
                hackathon_notice.content = "%s is coming soon, stay tuned!" % (
                    hackathon.display_name)
            # elif notice_event == HACK_NOTICE_EVENT.HACK_EDIT and hackathon:
            #     hackathon_notice.content = u"%s has been updated, check it out!" % (hackathon.display_name)
            elif notice_event == HACK_NOTICE_EVENT.HACK_ONLINE:
                hackathon_notice.content = "%s has started, click to sign up!" % (
                    hackathon.display_name)
                hackathon_notice.link = "/site/%s" % hackathon.name
            elif notice_event == HACK_NOTICE_EVENT.HACK_OFFLINE:
                hackathon_notice.content = "%s has come to a successful end, click for details!" % (
                    hackathon.display_name)
                hackathon_notice.link = "/site/%s" % hackathon.name
            elif notice_event == HACK_NOTICE_EVENT.HACK_PLAN and body.get(
                    'receiver', None):
                user = body.get('receiver')
                old_hackathon_notice = HackathonNotice.objects(
                    receiver=user,
                    event=HACK_NOTICE_EVENT.HACK_PLAN,
                    hackathon=hackathon).first()
                if old_hackathon_notice:  # duplicate
                    return old_hackathon_notice.dic()

                hackathon_notice.content = "您有未完成的任务,请提交开发说明书"
                hackathon_notice.receiver = user
                hackathon_notice.link = "/site/%s/team" % (hackathon.name)
            else:
                pass

        if notice_event == HACK_NOTICE_EVENT.EXPR_JOIN and body.get('user_id'):
            user_id = body.get('user_id')
            user = self.user_manager.get_user_by_id(user_id)
            hackathon_notice.content = "用户 %s 开始编程" % (user.nickname)
        else:
            pass

        # use assigned value if content or link is assigned in body
        hackathon_notice.content = body.get('content',
                                            hackathon_notice.content)
        hackathon_notice.link = body.get('link', hackathon_notice.link)

        hackathon_notice.save(validate=False)

        self.log.debug(
            "a new notice is created: hackathon: %s, event: %d, category: %d" %
            (hackathon.name if hackathon else "(none)", notice_event,
             notice_category))
        return hackathon_notice.dic()

    def update_hackathon_notice(self, body):
        hackathon_notice = HackathonNotice.objects(id=body.get('id')).first()
        if not hackathon_notice:
            return not_found("hackathon_notice not found")

        hackathon_notice.content = body.get("content",
                                            hackathon_notice.content)
        hackathon_notice.link = body.get("link", hackathon_notice.link)
        hackathon_notice.category = body.get("category",
                                             hackathon_notice.category)
        hackathon_notice.update_time = self.util.get_now()

        hackathon_notice.save(validate=False)
        return hackathon_notice.dic()

    def delete_hackathon_notice(self, notice_id):
        hackathon_notice = HackathonNotice.objects(id=notice_id).first()
        if not hackathon_notice:
            return not_found('Hackathon notice not found')

        hackathon_notice.delete()
        return ok()

    def get_hackathon_notice_list(self, body):
        """
        list hackathon notices, notices are paginated, can be filtered by hackathon_name, event and category,
        can be ordered by update_time, event and category.

        :type body: Context
        :param body: valid key/values(all key/values are optional)
            body = {
                hackathon_name: string,                  // filter by hackathon_name, default unfiltered
                filter_by_user: '******' | 'all',         // filter by user, default filter all notice that has specfic receivers
                category: 'int[,int...]',                // filter by category, default unfiltered
                event: 'int[,int...]',                   // filter by event, default unfiltered
                order_by: 'time' | 'event' | 'category', // order by update_time, event, category, default by time
                page: int,                               // page number after pagination, start from 1, default 1
                per_page: int                            // items per page, default 1000
            }

        :return: json style text, see util.Utility

        ::Example:
        : body = { order_by: 'time', category: '1,2,3', page: 1, per_page: 6 }
            search first 6 notices ordered by time, filtered by: category in [1,2,3]
        : body = { hackathon_name: 'hackathon', event: '1', order_by: 'event' }
            search first 1000 notices ordered by event, filtered by event == 1 and hackathon_name == 'hackathon'
        """

        hackathon_name = body.get("hackathon_name")
        filter_by_user = body.get("filter_by_user", "")
        notice_category = body.get("category")
        notice_event = body.get("event")
        order_by = body.get("order_by", "time")
        page = int(body.get("page", 1))
        per_page = int(body.get("per_page", 1000))

        hackathon_filter = Q()
        category_filter = Q()
        event_filter = Q()
        user_filter = Q(receiver=None)
        is_read_filter = Q()
        order_by_condition = '-update_time'

        if hackathon_name:  # list notices that belong to a specific hackathon
            hackathon = Hackathon.objects(
                name=hackathon_name).only('name').first()
            if hackathon:
                hackathon_filter = Q(hackathon=hackathon)
            else:
                return not_found('hackathon_name not found')
        else:  # only list online hackathons' notices or notices that don't belong to any hackathon
            online_hackathon = Hackathon.objects(status=HACK_STATUS.ONLINE)
            hackathon_filter = Q(hackathon__in=online_hackathon) | Q(
                hackathon=None)

        if filter_by_user:  # only return notices that are sent to the logged-in user
            user = None
            if self.user_manager.validate_login():
                user = g.user
                user_filter = Q(receiver=user)
                if filter_by_user == 'unread':
                    is_read_filter = Q(is_read=False)
            else:
                return bad_request("please login first")

        if notice_category:
            notice_category_tuple = tuple(
                [int(category) for category in notice_category.split(',')])
            category_filter = Q(category__in=notice_category_tuple)
        if notice_event:
            notice_event_tuple = tuple(
                [int(event) for event in notice_event.split(',')])
            event_filter = Q(event__in=notice_event_tuple)

        if order_by == 'category':
            order_by_condition = '+category'
        elif order_by == 'event':
            order_by_condition = '+event'
        else:
            order_by_condition = '-update_time'

        pagination = HackathonNotice.objects(
            hackathon_filter & category_filter & event_filter & user_filter
            & is_read_filter).order_by(order_by_condition).paginate(
                page, per_page)

        def func(hackathon_notice):
            return hackathon_notice.dic()

        # return serializable items as well as total count
        return self.util.paginate(pagination, func)

    def check_notice_and_set_read_if_necessary(self, notice_id):
        hackathon_notice = HackathonNotice.objects(id=notice_id).first()
        if hackathon_notice:
            user = g.user
            if not user or user.id != hackathon_notice.receiver.id:  # not the intended receiver
                return ok()

            hackathon_notice.is_read = True
            if hackathon_notice.event == HACK_NOTICE_EVENT.HACK_PLAN:  # set is_read = True if dev_plan is complete
                user = hackathon_notice.receiver
                hackathon = hackathon_notice.hackathon
                team = Team.objects(members__user=user,
                                    hackathon=hackathon).first()
                if team:
                    if not team.dev_plan:  # the dev_plan isn't submitted
                        hackathon_notice.is_read = False
            hackathon_notice.save()
            return ok()

    def schedule_pre_allocate_expr_job(self):
        """Add an interval schedule job to check all hackathons"""
        next_run_time = self.util.get_now() + timedelta(seconds=3)
        self.scheduler.add_interval(
            feature="hackathon_manager",
            method="check_hackathon_for_pre_allocate_expr",
            id="check_hackathon_for_pre_allocate_expr",
            next_run_time=next_run_time,
            minutes=20)

    def __is_pre_allocate_enabled(self, hackathon):
        if hackathon.event_end_time < self.util.get_now():
            return False
        # use registration time instead of event_start_time so this can be tested earlier
        if hackathon.registration_start_time > self.util.get_now():
            return False
        if hackathon.status != HACK_STATUS.ONLINE:
            return False
        if hackathon.config.get(HACKATHON_CONFIG.CLOUD_PROVIDER,
                                CLOUD_PROVIDER.NONE) == CLOUD_PROVIDER.NONE:
            return False
        return hackathon.config.get(HACKATHON_CONFIG.PRE_ALLOCATE_ENABLED,
                                    False)

    def check_hackathon_for_pre_allocate_expr(self):
        """Check all hackathon for pre-allocate

        Add an interval job for hackathon if it's pre-allocate is enabled.
        Otherwise try to remove the schedule job
        """
        hackathon_list = Hackathon.objects()
        for hack in hackathon_list:
            job_id = "pre_allocate_expr_" + str(hack.id)
            is_job_exists = self.scheduler.has_job(job_id)
            if self.__is_pre_allocate_enabled(hack):
                if is_job_exists:
                    self.log.debug(
                        "pre_allocate job already exists for hackathon %s" %
                        str(hack.name))
                    continue

                self.log.debug("add pre_allocate job for hackathon %s" %
                               str(hack.name))
                next_run_time = self.util.get_now() + timedelta(
                    seconds=(20 * random.random()))
                pre_allocate_interval = self.__get_pre_allocate_interval(hack)
                self.scheduler.add_interval(
                    feature="expr_manager",
                    method="pre_allocate_expr",
                    id=job_id,
                    context=Context(hackathon_id=hack.id),
                    next_run_time=next_run_time,
                    seconds=pre_allocate_interval)
            elif is_job_exists:
                self.log.debug(
                    "remove job for hackathon %s since pre_allocate is disabled"
                    % str(hack.name))
                self.scheduler.remove_job(job_id)
        return True

    def hackathon_online(self, hackathon):
        req = ok()

        if hackathon.status in (HACK_STATUS.DRAFT, HACK_STATUS.OFFLINE,
                                HACK_STATUS.APPLY_ONLINE):
            if self.util.is_local() or hackathon.config.get(
                    'cloud_provider') == CLOUD_PROVIDER.NONE:
                req = ok()
            elif hackathon.config.get(
                    'cloud_provider') == CLOUD_PROVIDER.AZURE:
                raise NotImplementedError()

        elif hackathon.status == HACK_STATUS.ONLINE:
            req = ok()
        else:
            req = general_error(code=HTTP_CODE.CREATE_NOT_FINISHED)

        if req.get('error') is None:
            hackathon.status = HACK_STATUS.ONLINE
            hackathon.save()
            self.create_hackathon_notice(
                hackathon.id, HACK_NOTICE_EVENT.HACK_ONLINE,
                HACK_NOTICE_CATEGORY.HACKATHON)  # hackathon online

        return req

    def hackathon_offline(self, hackathon):
        req = ok()
        if hackathon.status in (HACK_STATUS.ONLINE, HACK_STATUS.DRAFT,
                                HACK_STATUS.APPLY_ONLINE):
            hackathon.status = HACK_STATUS.OFFLINE
            hackathon.save()
            self.create_hackathon_notice(
                hackathon.id, HACK_NOTICE_EVENT.HACK_OFFLINE,
                HACK_NOTICE_CATEGORY.HACKATHON)  # hackathon offline

        elif hackathon.status == HACK_STATUS.INIT:
            req = general_error(code=HTTP_CODE.CREATE_NOT_FINISHED)

        return req
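    # Status transitions implemented by hackathon_online/hackathon_offline above:
    #
    #   DRAFT | OFFLINE | APPLY_ONLINE --hackathon_online-->  ONLINE  (+ HACK_ONLINE notice)
    #   ONLINE | DRAFT | APPLY_ONLINE  --hackathon_offline--> OFFLINE (+ HACK_OFFLINE notice)
    #   INIT -> general_error(CREATE_NOT_FINISHED) in both directions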

    # TODO: we need to review those commented items one by one to decide the API output
    def __get_hackathon_detail(self, hackathon, user=None):
        """Return hackathon info as well as its details including configs, stat, organizers, like if user logon"""
        detail = hackathon.dic()

        detail["stat"] = {"register": 0, "like": 0}

        for stat in HackathonStat.objects(hackathon=hackathon):
            if stat.type == HACKATHON_STAT.REGISTER:
                detail["stat"]["register"] = stat.count
            elif stat.type == HACKATHON_STAT.LIKE:
                detail["stat"]["like"] = stat.count

        if user:
            user_hackathon = UserHackathon.objects(hackathon=hackathon,
                                                   user=user).first()
            if user_hackathon and user_hackathon.like:
                detail['like'] = user_hackathon.like

            detail["user"] = self.user_manager.user_display_info(user)
            detail["user"]["is_admin"] = user.is_super or (
                user_hackathon and user_hackathon.role == HACK_USER_TYPE.ADMIN)

            # TODO: we need to review those items one by one to decide the API output
            # asset = self.db.find_all_objects_by(UserHackathonAsset, user_id=user.id, hackathon_id=hackathon.id)
            # if asset:
            #     detail["asset"] = [o.dic() for o in asset]

            if user_hackathon and user_hackathon.role == HACK_USER_TYPE.COMPETITOR:
                detail["registration"] = user_hackathon.dic()
                team = Team.objects(hackathon=hackathon,
                                    members__user=user).first()
                if team:
                    detail["team"] = team.dic()

        return detail

    def __fill_hackathon_detail(self, hackathon, user, hackathon_stat,
                                user_hackathon, team):
        """Return hackathon info as well as its details including configs, stat, organizers, like if user logon"""
        detail = hackathon.dic()

        detail["stat"] = {"register": 0, "like": 0}

        for stat in hackathon_stat:
            if stat.type == HACKATHON_STAT.REGISTER and stat.hackathon.id == hackathon.id:
                detail["stat"]["register"] = stat.count
            elif stat.type == HACKATHON_STAT.LIKE and stat.hackathon.id == hackathon.id:
                detail["stat"]["like"] = stat.count

        if user:
            detail['user'] = self.user_manager.user_display_info(user)
            detail['user']['admin'] = user.is_super
            if user_hackathon:
                for uh in user_hackathon:
                    if uh.hackathon.id == hackathon.id:
                        detail['user']['admin'] = detail['user']['admin'] or (
                            uh.role == HACK_USER_TYPE.ADMIN)

                        if uh.like:
                            detail['like'] = uh.like

                        if uh.role == HACK_USER_TYPE.COMPETITOR:
                            detail['registration'] = uh.dic()
                            for t in team:
                                if t.hackathon.id == hackathon.id:
                                    detail['team'] = t.dic()
                                    break
                        break

        return detail

    def __create_hackathon(self, creator, context):
        """Insert hackathon and creator(admin of course) to database

        We enforce that default config are used during the creation

        :type context: Context
        :param context: context of the args to create a new hackathon

        :rtype: Hackathon
        :return hackathon instance
        """

        new_hack = Hackathon(
            name=context.name,
            display_name=context.display_name,
            ribbon=context.get("ribbon"),
            description=context.get("description"),
            short_description=context.get("short_description"),
            location=context.get("location"),
            banners=context.get("banners", []),
            status=HACK_STATUS.INIT,
            creator=creator,
            type=context.get("type", HACK_TYPE.HACKATHON),
            config=context.get("config", Context()).to_dict(),
            tags=context.get("tags", []),
            event_start_time=context.get("event_start_time"),
            event_end_time=context.get("event_end_time"),
            registration_start_time=context.get("registration_start_time"),
            registration_end_time=context.get("registration_end_time"),
            judge_start_time=context.get("judge_start_time"),
            judge_end_time=context.get("judge_end_time"))

        # basic xss prevention
        if new_hack.description:  # description may be None
            new_hack.description = self.cleaner.clean_html(
                new_hack.description)
        new_hack.save()

        # add the current login user as admin and creator
        try:
            admin = UserHackathon(user=creator,
                                  hackathon=new_hack,
                                  role=HACK_USER_TYPE.ADMIN,
                                  status=HACK_USER_STATUS.AUTO_PASSED,
                                  remark='creator')
            admin.save()
        except Exception as ex:
            # TODO: send an email to remind the administrator to deal with this problem
            self.log.error(ex)
            raise InternalServerError(
                "fail to create the default administrator")

        return new_hack

    def __get_pre_allocate_interval(self, hackathon):
        interval = self.get_basic_property(
            hackathon, HACKATHON_CONFIG.PRE_ALLOCATE_INTERVAL_SECONDS)
        if interval:
            return int(interval)
        else:
            return 300 + random.random() * 50

    def __get_hackathon_configs(self, hackathon):
        def __internal_get_config():
            configs = {}
            for c in hackathon.configs.all():
                configs[c.key] = c.value
            return configs

        cache_key = self.__get_config_cache_key(hackathon)
        return self.cache.get_cache(key=cache_key,
                                    createfunc=__internal_get_config)

    def __get_hackathon_organizers(self, hackathon):
        organizers = self.db.find_all_objects_by(HackathonOrganizer,
                                                 hackathon_id=hackathon.id)
        return [o.dic() for o in organizers]

    def __parse_update_items(self, args, hackathon):
        """Parse properties that need to update

        Only those whose value changed items will be returned. Also some static property like id, name, create_time
        and unexisted properties should NOT be updated.

        :type args: dict
        :param args: arguments from http body which contains new values

        :type hackathon: Hackathon
        :param hackathon: the existing Hackathon object which contains old values

        :rtype: dict
        :return a dict that contains all properties that are updated.
        """
        result = {}

        hackathon_dic = hackathon.dic()
        args = dict(args)  # normalize once instead of re-converting in the loop
        for key in args:
            if hasattr(hackathon,
                       key) and (key not in hackathon_dic
                                 or args[key] != hackathon_dic[key]):
                result[key] = args[key]

        result.pop('id', None)
        result.pop('name', None)
        result.pop('creator', None)
        result.pop('create_time', None)
        result['update_time'] = self.util.get_now()
        return result
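    # A worked example of the diffing above (values hypothetical):
    #
    #   hackathon.dic() == {"id": 1, "name": "oh", "location": "Shanghai"}
    #   args == {"name": "oh2", "location": "Beijing", "nonexistent": 1}
    #   -> {"location": "Beijing", "update_time": <now>}
    #      ('name' is static and popped; 'nonexistent' fails the hasattr check)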

    def __get_hackathon_stat(self, hackathon):
        stats = HackathonStat.objects(hackathon=hackathon).all()
        result = {"hackathon_id": str(hackathon.id), "online": 0, "offline": 0}
        for item in stats:
            result[item.type] = item.count

        reg_list = UserHackathon.objects(
            hackathon=hackathon,
            role=HACK_USER_TYPE.COMPETITOR,
            deleted=False,
            status__in=[
                HACK_USER_STATUS.AUTO_PASSED, HACK_USER_STATUS.AUDIT_PASSED
            ]).only("user").no_dereference().all()
        reg_list = [uh.user.id for uh in reg_list]
        reg_count = len(reg_list)
        if reg_count > 0:
            online_count = User.objects(id__in=reg_list, online=True).count()
            result["online"] = online_count
            result["offline"] = reg_count - online_count

        return result

    def __get_config_cache_key(self, hackathon):
        return "hackathon_config_%s" % hackathon.id

    def __create_default_data_for_local(self, hackathon):
        """
        create test data for new hackathon. It's for local development only
        :param hackathon:
        :return:
        """
        try:
            # test docker host server
            host = DockerHostServer(vm_name="localhost",
                                    public_dns="localhost",
                                    public_ip="127.0.0.1",
                                    public_docker_api_port=4243,
                                    private_ip="127.0.0.1",
                                    private_docker_api_port=4243,
                                    container_count=0,
                                    container_max_count=100,
                                    disabled=False,
                                    state=DockerHostServerStatus.DOCKER_READY,
                                    hackathon=hackathon)
            host.save()
        except Exception as e:
            self.log.error(e)
            self.log.warn("fail to create test data")
Esempio n. 28
0
import re
import logging
import logging.config

from bs4 import BeautifulSoup
from bs4.element import Tag
from lxml.html.clean import Cleaner  # needed by the module-level cleaner below
from readability import htmls

from extractor.util import load_log_config

logging.config.dictConfig(load_log_config())
logger = logging.getLogger('applog.' + __name__)

# Handle jQuery Lazy Load Plugin
IMAGE_URL_KEYS = ('src', 'data-lazy-src', 'data-original',)

cleaner = Cleaner(
    scripts=True, javascript=True, style=True, comments=True, forms=False,
    links=True, processing_instructions=True,
    kill_tags=['footer', 'nav', 'select', 'button', 'noscript'],
)

class Parser:
    __REGEX_TITLE_ATTR = re.compile('title', re.IGNORECASE)
    __REGEX_NOT_TITLE_ATTR = re.compile('sub|side|related', re.IGNORECASE)

    def __init__(self, type_='lxml', html_string=''):
        if type_ == 'lxml':
            self._parser = LxmlParser(html_string)
        elif type_ == 'soup':
            self._parser = SoupParser(html_string)
        else:
            raise ValueError('parser type must be lxml or soup')
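
# LxmlParser and SoupParser are defined elsewhere in the original module; the
# dispatch above only assumes they share a constructor taking the raw HTML
# string, roughly (hypothetical stubs, not the project's real classes):
#
#   class LxmlParser(object):
#       def __init__(self, html_string):
#           self.root = lxml.html.fromstring(html_string or '<html/>')
#
#   class SoupParser(object):
#       def __init__(self, html_string):
#           self.soup = BeautifulSoup(html_string or '', 'html.parser')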
Esempio n. 29
0
from lxml.html.clean import Cleaner
from IPython.display import display_html  # assuming an IPython/Jupyter context


def display_raw(html):
    cleaner = Cleaner()
    cleaner.javascript = True  # strip javascript
    cleaner.style = True       # strip inline styles and style sheets
    html = cleaner.clean_html(html)
    return display_html(html, raw=True)
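
# Example notebook usage (assuming the IPython import above):
#
#   display_raw('<p>hi<script>alert(1)</script></p>')
#   # renders "<p>hi</p>" -- the script element is stripped by clean_html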
Esempio n. 30
0
import os
import subprocess
from tempfile import mkstemp

from lxml import html, etree
from lxml.html.clean import Cleaner

# NOTE: DocumentIngestor, IngestorException and get_config come from the host
# project; they are assumed to be importable in the original module.


class HtmlIngestor(DocumentIngestor):
    MIME_TYPES = ['text/html']
    EXTENSIONS = ['html', 'htm', 'asp', 'aspx', 'jsp']

    cleaner = Cleaner(scripts=True,
                      javascript=True,
                      style=True,
                      links=True,
                      embedded=True,
                      forms=True,
                      frames=True,
                      annoying_tags=True,
                      meta=False,
                      remove_tags=['a'])

    def generate_pdf_version(self, html_path):
        """OK, this is weirder. Converting HTML to PDF via WebKit."""
        fh, out_path = mkstemp(suffix='.pdf')
        os.close(fh)
        wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
        args = [
            wkhtmltopdf, '--disable-javascript', '--no-outline', '--no-images',
            '--quiet', html_path, out_path
        ]
        subprocess.call(args)
        return out_path

    def ingest(self, meta, local_path):
        fh, out_path = mkstemp(suffix='.htm')
        os.close(fh)
        with open(local_path, 'rb') as fh:
            data = fh.read()
        doc = html.fromstring(data)
        if not meta.has('title'):
            title = doc.findtext('.//title')
            if title is not None:
                meta.title = title.strip()

        if not meta.has('summary'):
            summary = doc.find('.//meta[@name="description"]')
            if summary is not None and summary.get('content'):
                meta.summary = summary.get('content')

        for field in ['keywords', 'news_keywords']:
            value = doc.find('.//meta[@name="%s"]' % field)
            if value is not None:
                value = value.get('content') or ''
                for keyword in value.split(','):
                    meta.add_keyword(keyword.strip())

        self.cleaner(doc)
        try:
            with open(out_path, 'wb') as fh:  # bytes: etree.tostring returns bytes
                fh.write(etree.tostring(doc))

            pdf_path = self.generate_pdf_version(out_path)
            if pdf_path is None or not os.path.isfile(pdf_path):
                raise IngestorException("Could not convert document: %r", meta)
            self.extract_pdf_alternative(meta, pdf_path)
        finally:
            if os.path.isfile(out_path):
                os.unlink(out_path)
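
# For reference, generate_pdf_version above shells out to wkhtmltopdf; the
# equivalent command line (paths illustrative) is:
#
#   wkhtmltopdf --disable-javascript --no-outline --no-images --quiet in.htm out.pdf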