def get_wiki_info(name):
	url = "http://ja.wikipedia.org/w/index.php?title={0}&action=edit".format(quote(name))
	data = get_page_with_cache("edit_"+name, url)
	inside_textarea = False
	lon = lat = None
	# for each line
	for line in data.splitlines():
		# make sure that we only analyze inside the textarea
		if not inside_textarea:
			if line.find("<textarea")!=-1:
				inside_textarea = True
			continue
		line = line.strip()
		# after this mark, no interesting data
		if line.find("</textarea")!=-1:
			inside_textarea = False
			continue

		if line.find("{{aimai}}")!=-1:
			raise Exception("disambiguation page")

		# Detect Mediawiki tagging: |key = value
		if not line.startswith("|") or line.find("=")==-1:
			continue
		# extract the mediawiki key and value from the line
		# which could be for example:
		# |よみがな = とうきょう
		try:
			key, val = line[1:].split("=", 1)
			key = key.strip()
			val = val.split("&lt;")[0].strip()
			val = parser.unescape(parser.unescape(val))
		except Exception as e:
			continue
		if key=="座標":
			# example: {{ウィキ座標2段度分秒|34|45|47.43|N|135|31|25.21|E|}}
			fields = val.split("|")
			if len(fields)>=7:
				lat = convert_coord(fields[1:4]) or lat
				lon = convert_coord(fields[5:8]) or lon
		# other coordinate tag which can contain lat or lon, or both
		elif key=="緯度度" or key=="経度度":
			pairs = dict(tuple(map(str.strip, p.split("=", 1))) for p in line[1:].split("|") if "=" in p)
			lat_fields = (pairs.get("緯度度"), pairs.get("緯度分"), pairs.get("緯度秒"))
			lon_fields = (pairs.get("経度度"), pairs.get("経度分"), pairs.get("経度秒"))
			if None not in lat_fields:
				lat = convert_coord(lat_fields)
			if None not in lon_fields:
				lon = convert_coord(lon_fields)
	# if not lat or not lon:
	# 	print(name, lat, lon)
	return lat, lon
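The convert_coord helper used above is not shown; a minimal sketch of what it might look like, assuming it takes a (degrees, minutes, seconds) sequence of strings and returns decimal degrees (hemisphere sign handling omitted):

# Hypothetical helper: convert a DMS triple such as ("34", "45", "47.43")
# to decimal degrees; return None if a field is missing or not numeric.
def convert_coord(fields):
	try:
		deg, minutes, seconds = (float(f) for f in fields)
	except (TypeError, ValueError):
		return None
	return deg + minutes / 60.0 + seconds / 3600.0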
Exemple #2
0
    def set_request_cookie(self, user_id):
        """
        POST request for non-json content body
        :param user_id: userid used to send the request
        :type user_id:string
        """
        # Assumes password is always user_id + 1234
        password = user_id + '1234'

        # Make a request to the smarter URL so that we get redirected to the OpenAM page.
        self.send_request("GET", "/data")
        json_response = self._response.json()
        redirect = json_response['redirect']
        self.send_request("GET", redirect, use_base=False)
        # This should redirect us to IDP page. Extract the response message.
        response = self._response.content.decode('utf-8')

        # Search for regular expressions from the response body
        goto = re.search('name=\\"goto\\" value=\\"(.*)\\"', response).group(1)
        sun_query = re.search('name=\\"SunQueryParamsString\\" value=\\"(.*)\\"', response).group(1)
        self.set_request_header('content-type', 'application/x-www-form-urlencoded')
        # Get the LOGIN FORM. Compose a redirect PATH using the parameters extracted from the last GET request made to smarter
        # Submit the request to get the login form from IDP.
        request_data = {'goto': goto, 'SunQueryParamsString': sun_query, 'IDButton': 'Log In', 'gx_charset': 'UTF-8',
                        'encoded': 'true', 'IDToken1': user_id, 'IDToken2': password}

        # Send login request to IDP
        self.send_post(preferences(Default.idp), request_data)
        # Extract the response received from IDP
        response = self._response.content.decode('utf-8')
        # Submit the login information from the login form received in the previous request
        # and send a POST request to smarter to get the cookie information.
        parser = html.parser.HTMLParser()
        url = re.search('action=\\"(.*?)\\"', response).group(1)
        samlresponse = re.search('name=\\"SAMLResponse\\" value=\\"(.*?)\\"', response).group(1)
        relaystate = re.search('name=\\"RelayState\\" value=\\"(.*?)\\"', response).group(1)
        data = {'SAMLResponse': samlresponse, 'RelayState': relaystate}

        self.set_request_header('content-type', 'application/x-www-form-urlencoded')
        # unescape the strings
        url = parser.unescape(str(url))
        data['SAMLResponse'] = parser.unescape(str(data['SAMLResponse']))
        data['RelayState'] = parser.unescape(str(data['RelayState']))

        # Send post request
        self.send_post(url, data)
        response = self._response.content.decode('utf-8')

        # Get the cookie from response
        cookie_value = self._response.cookies
        self._request_header['cookies'] = cookie_value
Exemple #3
0
    def format_transcript_element(self, element, element_number):
        """
        Format transcript's element in order for it to be converted to WebVTT format.
        """
        sub_element = "\n\n"
        html_parser = HTMLParser()
        if element.tag == "text":
            start = float(element.get("start"))
            duration = float(element.get("dur", 0))  # dur is not mandatory
            text = element.text
            end = start + duration
            if text:
                formatted_start = self.format_transcript_timing(start)
                formatted_end = self.format_transcript_timing(end, 'end')
                timing = '{} --> {}'.format(formatted_start, formatted_end)
                text_encoded = text.encode('utf8', 'ignore')
                text = text_encoded.replace(b'\n', b' ')
                unescaped_text = unescape(text.decode('utf8'))
                sub_element = """\
                {element_number}
                {timing}
                {unescaped_text}

                """.format(element_number=element_number,
                           timing=timing,
                           unescaped_text=unescaped_text)
        return textwrap.dedent(sub_element)
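format_transcript_timing is referenced but not shown; a rough standalone sketch, assuming it renders a float number of seconds as a WebVTT HH:MM:SS.mmm timestamp (the original is a method and also takes a second argument such as 'end', whose effect is not reproduced here):

# Hypothetical helper: 3.5 -> "00:00:03.500"
def format_transcript_timing(seconds):
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    millis = int(round((seconds - int(seconds)) * 1000))
    return '{:02d}:{:02d}:{:02d}.{:03d}'.format(hours, minutes, secs, millis)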
Exemple #4
0
def clean(text):
    """
    A function for cleaning a string of text.
    Returns valid ASCII characters.
    """
    import sys, unicodedata

    text = clean_whitespace(text)

    # Remove links from message
    #text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Replace HTML escape characters
    if sys.version_info[0] < 3:
        from HTMLParser import HTMLParser
        parser = HTMLParser()
        text = parser.unescape(text)
    else:
        import html.parser
        parser = html.parser.HTMLParser()
        text = parser.unescape(text)

    # Normalize unicode characters
    # 'raw_input' is just 'input' in python3
    if sys.version_info[0] < 3:
        text = unicode(text)
    text = unicodedata.normalize("NFKD", text).encode("ascii",
                                                      "ignore").decode("utf-8")

    return str(text)
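clean_whitespace is not defined in this snippet; a minimal sketch of such a helper, assuming it only collapses runs of whitespace:

import re

# Hypothetical helper: collapse consecutive whitespace and trim the ends.
def clean_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()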
Exemple #5
0
 def enrich_data(self, item_loader, response):
     item_loader.add_re("id", r'tid=(\d+)')
     title = "".join(response.xpath('//h4/text()').extract())
     item_loader.add_value("title", title)
     video_url = unescape("".join(
         response.selector.re(r"\.src='(.*)#iframeload'")))
     return [("video_url", item_loader, {"url": video_url})]
Exemple #6
0
    def run(self):
        """
        Checks through the submissions and archives and posts comments.
        """
        if not self._setup:
            raise Exception("Snapshiller not ready yet!")

        submissions = r.get_new(limit=self.limit)

        for submission in submissions:
            # Your crap posts aren't worth wasting precious CPU cycles and
            # archive.is and archive.org's bandwidth. HAIL ELLEN PAO
            if submission.author and submission.author.name == "PoliticBot":
                log.info("Submission by banned user; skipping.")
                continue
            log.debug("Found submission.\n" + submission.permalink)
            if not should_notify(submission):
                log.debug("Skipping.")
                continue
            archives = [ArchiveContainer(fix_url(submission.url),
                                         "*This Post*")]
            if submission.is_self and submission.selftext_html is not None:
                log.debug("Found text post...")
                links = BeautifulSoup(unescape(
                    submission.selftext_html)).find_all("a")
                if not len(links):
                    continue
                for anchor in links:
                    log.debug("Found link in text post...")
                    url = fix_url(anchor['href'])
                    archives.append(ArchiveContainer(url, anchor.contents[0]))
                    ratelimit(url)
            Notification(submission, self._get_header(submission.subreddit),
                         archives).notify()
            db.commit()
Exemple #7
0
def clean(text):
    """
    A function for cleaning a string of text.
    Returns valid ASCII characters.
    """
    import unicodedata
    import sys

    text = clean_whitespace(text)

    # Remove links from message
    # text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Replace HTML escape characters
    if sys.version_info[0] < 3:
        from HTMLParser import HTMLParser

        parser = HTMLParser()
        text = parser.unescape(text)
    else:
        import html.parser

        parser = html.parser.HTMLParser()
        text = parser.unescape(text)

    # Normalize unicode characters
    # 'raw_input' is just 'input' in python3
    if sys.version_info[0] < 3:
        text = unicode(text)
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")

    return str(text)
Exemple #8
0
def parse_from_url(url: str) -> dict:
    """调用 newspaper 处理 url"""
    ret = Article(
        url,
        browser_user_agent=
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.16 Safari/537.36 Edg/79.0.309.15',
        request_timeout=30,
        keep_article_html=True)
    ret.download()  # download the page
    if ret.download_state == 2:
        ret.parse()  # parse the page
        item = {
            'url': url,
            'title': ret.title,
            'keywords': ret.meta_keywords,
            'description': ret.meta_description,
            'author': ret.authors,
            'publishdate': str(ret.publish_date) if ret.publish_date else '',
            'content': ret.text,
            'content_html': re.sub(r'\r|\n|\t', '',
                                   unescape(ret.article_html)),
        }
        return item
    else:
        raise Exception(
            f'Page download error, download_state: {ret.download_state}')
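A hypothetical call to the function above (the URL is illustrative and assumes the newspaper package is installed):

if __name__ == '__main__':
    item = parse_from_url('https://example.com/some-article.html')
    print(item['title'], item['publishdate'])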
Exemple #9
0
    def run(self):
        """
        Checks through the submissions and archives and posts comments.
        """
        if not self._setup:
            raise Exception("Snapshill not ready yet!")

        submissions = self.reddit.front.new(limit=self.limit)

        for submission in submissions:
            debugTime = time.time()
            warned = False

            log.debug("Found submission.\n" + submission.permalink)

            if not should_notify(submission):
                log.debug("Skipping.")
                continue

            archives = [
                ArchiveContainer(fix_url(submission.url), submission.title)
            ]

            if submission.is_self and submission.selftext_html is not None:
                log.debug("Found text post...")

                links = BeautifulSoup(unescape(
                    submission.selftext_html)).find_all("a")

                finishedURLs = []

                for anchor in links:
                    if time.time() > debugTime + WARN_TIME and not warned:
                        log.warn(
                            "Spent over {} seconds on post (ID: {})".format(
                                WARN_TIME, submission.name))

                        warned = True

                    log.debug("Found link in text post...")

                    url = fix_url(anchor["href"])

                    if skip_url(url):
                        continue

                    if url in finishedURLs:
                        continue  # skip for sanity

                    archives.append(ArchiveContainer(url, anchor.contents[0]))
                    finishedURLs.append(url)
                    ratelimit(url)

            Notification(
                self.reddit,
                submission,
                self._get_header(submission.subreddit),
                archives,
            ).notify()
            db.commit()
Exemple #10
0
def crawl(urls, links=100):
    res = set()
    crawled = set()
    q = deque(urls)
    parser = html.parser.HTMLParser()
    while len(q) > 0 and len(crawled) < links:
        url = q.popleft()
        if url in crawled:
            continue
        try:
            req = requests.get(url)
        except:
            continue
        if req.status_code != requests.codes.ok:
            continue
        print(url, file=sys.stderr)
        req.encoding = 'utf-8'
        contents = parser.unescape(req.text)
        newLinks = page_links(contents)
        for l in newLinks:
            q.extend([l])
        newProxies = page_leech(contents)
        for p in newProxies:
            res.add(p)
        crawled.add(url)
    return list(res)
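page_links and page_leech are assumed helpers; a minimal sketch of a link extractor in the same spirit, written against html.parser rather than whatever the original used:

from html.parser import HTMLParser

# Hypothetical helper: collect href attributes from <a> tags in an HTML document.
class _LinkCollector(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def page_links(contents):
    collector = _LinkCollector()
    collector.feed(contents)
    return collector.links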
Exemple #11
0
def get_rte_components(html_string):
    """Extracts the RTE components from an HTML string.

    Args:
        html_string: str. An HTML string.

    Returns:
        list(dict). A list of dictionaries, each representing an RTE component.
        Each dict in the list contains:
        - id: str. The name of the component, i.e. 'oppia-noninteractive-link'.
        - customization_args: dict. Customization arg specs for the component.
    """
    parser = html.parser.HTMLParser()
    components = []
    soup = bs4.BeautifulSoup(html_string, 'html.parser')
    oppia_custom_tag_attrs = (
        rte_component_registry.Registry.get_tag_list_with_attrs())
    for tag_name in oppia_custom_tag_attrs:
        component_tags = soup.find_all(name=tag_name)
        for component_tag in component_tags:
            component = {'id': tag_name}
            customization_args = {}
            for attr in oppia_custom_tag_attrs[tag_name]:
                # Unescape special HTML characters such as '&quot;'.
                attr_val = parser.unescape(component_tag[attr])
                # Adds escapes so that things like '\frac' aren't
                # interpreted as special characters.
                attr_val = attr_val.encode('unicode_escape')
                customization_args[attr] = json.loads(attr_val)
            component['customization_args'] = customization_args
            components.append(component)
    return components
Exemple #12
0
    def typeAnsAnswerFilter(self, buf):
        if not self.typeCorrect:
            return re.sub(self.typeAnsPat, "", buf)
        origSize = len(buf)
        buf = buf.replace("<hr id=answer>", "")
        hadHR = len(buf) != origSize
        # munge correct value
        parser = html.parser.HTMLParser()
        cor = stripHTML(self.mw.col.media.strip(self.typeCorrect))
        # ensure we don't chomp multiple whitespace
        cor = cor.replace(" ", "&nbsp;")
        cor = parser.unescape(cor)
        cor = cor.replace("\xa0", " ")
        given = self.typedAnswer
        # compare with typed answer
        res = self.correct(given, cor, showBad=False)
        # and update the type answer area
        def repl(match):
            # can't pass a string in directly, and can't use re.escape as it
            # escapes too much
            s = """
<span style="font-family: '%s'; font-size: %spx">%s</span>""" % (
                self.typeFont, self.typeSize, res)
            if hadHR:
                # a hack to ensure the q/a separator falls before the answer
                # comparison when user is using {{FrontSide}}
                s = "<hr id=answer>" + s
            return s
        return re.sub(self.typeAnsPat, repl, buf)
Exemple #13
0
    def typeAnsAnswerFilter(self, buf):
        if not self.typeCorrect:
            return re.sub(self.typeAnsPat, "", buf)
        origSize = len(buf)
        buf = buf.replace("<hr id=answer>", "")
        hadHR = len(buf) != origSize
        # munge correct value
        parser = html.parser.HTMLParser()
        cor = self.mw.col.media.strip(self.typeCorrect)
        cor = re.sub("(\n|<br ?/?>|</?div>)+", " ", cor)
        cor = stripHTML(cor)
        # ensure we don't chomp multiple whitespace
        cor = cor.replace(" ", "&nbsp;")
        cor = parser.unescape(cor)
        cor = cor.replace("\xa0", " ")
        cor = cor.strip()
        given = self.typedAnswer
        # compare with typed answer
        res = self.correct(given, cor, showBad=False)

        # and update the type answer area
        def repl(match):
            # can't pass a string in directly, and can't use re.escape as it
            # escapes too much
            s = """
<span style="font-family: '%s'; font-size: %spx">%s</span>""" % (
                self.typeFont, self.typeSize, res)
            if hadHR:
                # a hack to ensure the q/a separator falls before the answer
                # comparison when user is using {{FrontSide}}
                s = "<hr id=answer>" + s
            return s

        return re.sub(self.typeAnsPat, repl, buf)
Exemple #14
0
def toggle_checkbox_ajax():
    with session_scope() as session:
        try:
            db_type = request.form['db_type']
            db_id = int(request.form['db_id'])
            item_number = int(request.form['item_number'][9:])
            parser = html.parser.HTMLParser()
            item_text = parser.unescape(request.form['item_text'])
            if db_type == 'Source':
                db_ob = db.Source
            elif db_type == 'Series':
                db_ob = db.Series
            elif db_type == 'Term':
                db_ob = db.Term
            else:
                return jsonify({"status" : "failure"})

            try:
                ob = session.query(db_ob).get(int(db_id))
                if not ob:
                    return jsonify({"status" : "failure"})
                ob.notes = utils.toggle_checkbox(ob.notes, item_number, item_text)
                session.commit()
                return jsonify({"status" : "success"})
            except:
                return jsonify({"status" : "failure"})
        except:
            session.rollback()
            return jsonify({"status" : "failure"})
Exemple #15
0
    def get_msg(self):

        if "none" not in self._response.url:
            try:
                self._selector = etree.HTML(self._response.content)
                self._lis = self._selector.xpath(
                    "//div[@class = 'basic-info cmn-clearfix']")
                self._name =  [unescape(x.xpath('string(.)')).strip().replace("\xa0","") \
                              for x in self._lis[0].xpath("//dt[@class = 'basicInfo-item name']")]
                self._value = [unescape(x.xpath('string(.)')).strip().replace("\xa0","") \
                              for x in self._lis[0].xpath("//dd[@class = 'basicInfo-item value']")]
                self.msg = dict(zip(self._name, self._value))
            except:
                self.msg = {}

        else:
            self.msg = {}
Exemple #16
0
def unescape(text):
    if sys.version_info[0] < 3:
        print("1")
        parser = HTMLParser.HTMLParser()
    else:
        print("2")
        parser = html.parser.HTMLParser()
    return parser.unescape(text)
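HTMLParser.unescape was deprecated in Python 3.4 and removed in 3.9, so a version shim like the one above can instead target the module-level function. A rough sketch:

import sys

# Prefer the module-level unescape on Python 3.4+; the HTMLParser method is gone in 3.9+.
if sys.version_info[0] < 3:
    from HTMLParser import HTMLParser
    _parser = HTMLParser()
    def unescape(text):
        return _parser.unescape(text)
else:
    from html import unescape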
Exemple #17
0
def render_resource(path, **context):
    """
    Render static resource using provided context.

    Returns: django.utils.safestring.SafeText
    """
    html = Template(resource_string(path))
    return html_parser.unescape(html.render(Context(context)))
Exemple #18
0
def create_song(raw):
    """
    Create a Song object from the raw track data.
    :param raw:
    :return:
    """
    parser = html.parser
    song = Song()
    song.title = parser.unescape(raw['songName'])
    song.artist = parser.unescape(raw['artist'])
    song.album_name = parser.unescape(raw['album_name'])
    song.song_id = raw['song_id']
    song.album_id = raw['album_id']
    song.location = raw['location']
    song.lyric_url = raw['lyric_url']
    song.pic_url = raw['pic']
    return song
Exemple #19
0
def parse_html(html):
    data = html
    result_html = etree.HTML(data)
    items = result_html.xpath('//div[contains(@class,"userContentWrapper")]')
    result = ''
    for item in items:
        data = tostring(item, method='html')
        result = result + unescape(data.decode())
    return result
Exemple #20
0
def get_content():
    raw_text = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.main'))).text

    content = ''
    page_list = browser.find_elements_by_class_name('reader-txt-layer')
    for page in page_list:
        content += (unescape(page.text) + '\n')
    return content, raw_text
Exemple #21
0
def replaceHTMLCodes(txt):
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    if version_info[0] == 3:
        txt = HTMLParser.unescape(txt)
    else:
        txt = HTMLParser.HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", '"')
    txt = txt.replace("&amp;", "&")
    return txt
Exemple #22
0
def get_tv_series(url):
    sess.headers.update({
        'User-Agent': USER_AGENT,
        'Referer': HEJO_TV_BASE_URL,
    })

    html = get_url(url)
    xbmc.log("get_tv_series \n{}".format(html), level=xbmc.LOGINFO)
    dom = HtmlDom().createDom(html)

    series_list = dom.find('div.ml-item')
    xbmc.log("get_tv_series, found {} series".format(series_list.len),
             level=xbmc.LOGINFO)

    for series in series_list:
        xbmc.log("{}\n\n\n\n".format(series.html()), level=xbmc.LOGINFO)
        series_url = series.find('a.ml-mask').attr('href')
        series_title = unescape(series.find('a.ml-mask').attr('title'))
        series_poster = "{}{}".format(
            HEJO_TV_BASE_URL,
            series.find('img.thumb').first().attr('src'))
        series_description = unescape(
            series.find('div#hidden_tip').first().text().strip())

        series_info = {
            'plot': series_description,
            'title': series_title,
        }

        xbmc.log("{}, {}, {}".format(series_url, series_poster,
                                     series_description),
                 level=xbmc.LOGINFO)

        add_item(name=series_title,
                 url=series_url,
                 mode='get_tv_stream',
                 image=series_poster,
                 folder=False,
                 isplay=False,
                 infoLabels=series_info,
                 itemcount=series_list.len)

    xbmcplugin.setContent(addon_handle, 'videos')
    xbmcplugin.endOfDirectory(addon_handle, True)
Exemple #23
0
    def run(self):
        """
        Checks through the submissions and archives and posts comments.
        """
        if not self._setup:
            raise Exception("Snapshiller not ready yet!")

        submissions = r.get_new(limit=self.limit)

        for submission in submissions:
            debugTime = time.time()
            warned = False

            log.debug("Found submission.\n" + submission.permalink)

            if not should_notify(submission):
                log.debug("Skipping.")
                continue

            archives = [ArchiveContainer(fix_url(submission.url),
                                         "*This Post*")]
            if submission.is_self and submission.selftext_html is not None:
                log.debug("Found text post...")

                links = BeautifulSoup(unescape(
                    submission.selftext_html)).find_all("a")

                if not len(links):
                    continue

                finishedURLs = []

                for anchor in links:
                    if time.time() > debugTime + WARN_TIME and not warned:
                        log.warn("Spent over {} seconds on post (ID: {})".format(
                            WARN_TIME, submission.name))

                        warned = True

                    log.debug("Found link in text post...")

                    url = fix_url(anchor['href'])

                    if skip_url(url):
                        continue

                    if url in finishedURLs:
                        continue #skip for sanity

                    archives.append(ArchiveContainer(url, anchor.contents[0]))
                    finishedURLs.append(url)
                    ratelimit(url)

            Notification(submission, self._get_header(submission.subreddit),
                         archives).notify()
            db.commit()
Exemple #24
0
def about():
    hot_posts = _get_hot()
    tags = _get_tags()
    post = Post.query.filter(Post.category_id == 0).first_or_404()
    post.content = unescape(post.content)
    return render_template('about.html',
                           title='关于作者',
                           hot=hot_posts,
                           tags=tags,
                           post=post)
Exemple #25
0
def get_content():
    content = ''
    page_list = browser.find_elements_by_xpath('//div[contains(@id,"pageNo")]')
    for page in page_list:
        content += (unescape(page.text) + '\n')
        print(page.text)
    content = content.splitlines()
    content = ''.join(content)
    # re.sub needs (pattern, replacement, string); strip control whitespace characters
    content = re.sub(r'[\n\t\f\r]', '', content)
    return content
Exemple #26
0
 def render(self, email_msg, context=None):
     if self.hydrated_template is None:
         sections_text = ''.join(self.sections)
         self.hydrated_template = Template(HtmlEmailTemplate.base_template.render({"content": sections_text}))
     email_msg.content_subtype = "html"
     # For some reason XML markup characters in the template (<, >) get converted to entity codes (&lt; and &gt;);
     # we unescape to convert the markup characters back.
     _context = context or {}
     _context['use_signature'] = self.use_signature
     email_msg.body = unescape(self.hydrated_template.render(Context(_context)))
     return email_msg
Exemple #27
0
 def get_title(cls, data):
     parser = cls(convert_charrefs=True)
     try:
         parser.feed(data)
     except:  # many bugs lol
         return None
     title = parser.title
     if title is None:
         return None
     title = title.strip()
     if title:
         return re_space.sub(' ', unescape(title))
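get_title above is a classmethod on an HTMLParser subclass that exposes a title attribute; a minimal sketch of what such a subclass might look like (all names here are assumptions):

import re
from html import unescape
from html.parser import HTMLParser

re_space = re.compile(r'\s+')

# Hypothetical parser: remember the text of the first <title> element seen.
class TitleParser(HTMLParser):
    title = None
    _in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == 'title' and self.title is None:
            self._in_title = True

    def handle_data(self, data):
        if self._in_title:
            self.title = (self.title or '') + data

    def handle_endtag(self, tag):
        if tag == 'title':
            self._in_title = False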
Exemple #28
0
    def get_mul(self):

        self._mul = self._selector.xpath(
            "//ul[@class ='polysemantList-wrapper cmn-clearfix']")
        if self._mul:
            self.other_type = [ unescape(x.xpath('string(.)')).strip().replace("\xa0","")\
                                for x in self._mul[0].xpath("//li[@ class='item']//a")]
            self._type = self._mul[0].xpath(
                "//li[@ class='item']//span")[0].text
        else:
            self._mul = []
            self._type = ""
Exemple #29
0
def render_template(template_name, **context):
    """
    Render static resource using provided context.

    Returns: django.utils.safestring.SafeText
    """
    template_dirs = [os.path.join(os.path.dirname(__file__), 'static/html')]
    libraries = {'i18n': 'django.templatetags.i18n'}
    engine = Engine(dirs=template_dirs, debug=True, libraries=libraries)
    html = engine.get_template(template_name)

    return html_parser.unescape(html.render(Context(context)))
Exemple #30
0
 def decode_html(value):
     """
     Decode HTML entities.
     @param value: value to decode
     @type value: str
     """
     try:
         unicode_title = unicode(value, "utf8", "ignore")
         return unescape(unicode_title).encode("utf8")
     except:
         if PY3 and isinstance(value, bytes):
             value = value.decode("utf8")
         return value
Exemple #31
0
def get_bycategory(cid, page_num):
    if not page_num or int(page_num) < 1:
        page_num = 1
    paginate = Post.query.order_by(Post.post_time.desc()).filter_by(stype=1).filter_by(category_id=cid). \
        filter_by(status=1).paginate(int(page_num), gl.index_page_limit, True)
    posts = paginate.items
    for p in posts:
        p.comment_counts = 0
        p.time = time.strftime('%Y-%m-%d', time.localtime(p.post_time))
        p.content = unescape(p.content)
    hot = _get_hot()
    tags = _get_tags()
    return render_template('type.html', title='分类', posts=posts, pagination=paginate, cid=cid, hot=hot, tags=tags)
Exemple #32
0
    def run(self):
        """
        Checks through the submissions and archives and posts comments.
        """

        subreddit1 = reddit.subreddit(working_sub)

        for submission in subreddit1.stream.submissions():
            if submission.id not in posts_replied_to:
                posts_replied_to.append(submission.id)
                debugTime = time.time()
                warned = False
                with open("drama_posts_replied_to.txt", "a") as posts:
                    posts.write("{}\n".format(submission.id))
                    posts.close()
                log.info("Found submission.: {}".format(submission.permalink))

                archives = [ArchiveContainer(fix_url(submission.url),"*This Post*")]
                if submission.is_self and submission.selftext_html is not None:
                    log.debug("Found text post...")

                    links = BeautifulSoup(unescape(
                        submission.selftext_html)).find_all("a")

                    if not len(links):
                        continue

                    finishedURLs = []

                    for anchor in links:
                        if time.time() > debugTime + WARN_TIME and not warned:
                            log.warn("Spent over {} seconds on post (ID: {})".format(WARN_TIME, submission.name))
                            warned = True

                        log.debug("Found link in text post...")

                        url = fix_url(anchor['href'])

                        if skip_url(url):
                            continue

                        if url in finishedURLs:
                            continue #skip for sanity

                        archives.append(ArchiveContainer(url, anchor.contents[0]))
                        finishedURLs.append(url)
                        ratelimit(url)
                        time.sleep(50)

                Notification(submission, archives).notify()
                time.sleep(12)
Exemple #33
0
def get_sourcecode(url):
    try:
        ssl._create_default_https_context = ssl._create_unverified_context
        r = urllib.request.Request(
            url,
            headers={
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
            })
        r_source = urllib.request.urlopen(r)
        # decode the response bytes rather than str()-ing them (which would keep the b'...' repr)
        r_source_read = r_source.read().decode('utf-8', errors='ignore')
        return unescape(r_source_read)
    except:
        return None
Exemple #34
0
    def _get_quotes(wiki_page):
        # Remove remaining escape characters from wiki content
        quotes = unescape(wiki_page.content_md)

        # Remove comment lines starting with # or ; including any leading whitespace
        quotes = re.sub('^[ \t]*[#;].*$', '', quotes, flags=re.MULTILINE)

        # Split and strip the quotes into an array using --- as a delimiter
        quotes = [quote.strip() for quote in quotes.split('---')]

        # Remove any blank quotes
        quotes = [quote for quote in quotes if quote]

        return quotes
Exemple #35
0
def index():
    page_num = request.args.get('page_num')
    if not page_num or int(page_num) < 1:
        page_num = 1
    paginate = Post.query.filter_by(status=1).filter_by(stype=1).filter(Post.category_id > 0). \
        order_by(Post.post_time.desc()).paginate(int(page_num), gl.index_page_limit, True)
    posts = paginate.items
    for p in posts:
        p.comment_counts = 0
        p.time = str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(p.post_time))[0:-9])
        p.content = unescape(p.content)
    hot_posts = _get_hot()
    tags = _get_tags()
    return render_template('home.html', title='首页', posts=posts, pagination=paginate, hot=hot_posts, tags=tags)
Exemple #36
0
def rankings():
    last_updated = update_check()
    postition = request.form["action"]
    print(postition)
    player = [{'rank':.5}]
    player+=get_all_players(postition)
    player.append({'rank':len(player)})
    test = '['
    for item in player:
        test+=(json.dumps(item)) + ', '
    test += ']'
    print(test)
    test = parser.unescape(test)
    return render_template('rankings.html', updated=last_updated, type=postition, data=test)
Exemple #37
0
def getNews(symbol):
    url = buildNewsUrl(symbol)

    content = urlopen(url).read().decode('utf-8')

    content_json = demjson.decode(content)

    article_json = []
    news_json = content_json['clusters']
    for cluster in news_json:
        for article in cluster:
            if article == 'a':
                article_json.extend(cluster[article])

    return [[unescape(art['t']).strip(), art['u']] for art in article_json]
Exemple #38
0
 def format_transcript_text(self, text):
     """
     Prepare unescaped transcripts to be converted to WebVTT format.
     """
     new_text = [
         self.format_transcript_text_line(line)
         for line in text[0].splitlines()
     ]
     new_text = '\n'.join(new_text)
     unescaped_text = html_parser.unescape(new_text)
     if "WEBVTT" not in text:
         text = "WEBVTT\n\n" + unescaped_text
     else:
         text = unescaped_text
     return text
Exemple #39
0
def test_reset_proceed_wrong_confirm(user, db_session, default_app):
    """Reset test for reseting pasword with notmatched passwords."""
    user = db_session.merge(user)
    user.set_reset()
    transaction.commit()

    user = db_session.merge(user)
    res = default_app.get("/password/reset/" + user.reset_key)

    res.form["password"] = NEW_PASSWORD
    res.form["confirm_password"] = NEW_PASSWORD + "Typo"
    res = res.form.submit()

    assert "Error! Password doesn't match" in unescape(
        res.body.decode("unicode_escape"))
Exemple #40
0
def test_register_error(db_session, default_app, email, password,
                        confirm_password, error):
    """Error in registration process."""
    assert db_session.query(User).count() == 0

    res = default_app.get("/register")
    if email is not None:
        res.form["email"] = email
    res.form["password"] = password
    res.form["confirm_password"] = confirm_password
    res = res.form.submit(extra_environ={"REMOTE_ADDR": "0.0.0.0"})
    transaction.commit()

    assert error in unescape(res.body.decode("unicode_escape"))
    assert db_session.query(User).count() == 0
Exemple #41
0
def get_detail(pid):
    post = Post.query.filter_by(id=pid).filter_by(status=1).filter_by(stype=1).filter(Post.category_id > 0). \
        first_or_404()
    post.view_counts += 1
    db.session.flush()

    pre_post = Post.query.order_by(Post.id.desc()).filter_by(status=1).filter_by(stype=1).filter(Post.id < pid).first()
    next_post = Post.query.order_by(Post.id.asc()).filter(Post.id > pid).filter_by(status=1).filter_by(stype=1).first()
    post.time = str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(post.post_time)))
    post.comment_counts = 0
    post.content = unescape(post.content)
    hot = _get_hot()
    tags = _get_tags()

    _store_visitors(pid)
    return render_template('detail.html', title=post.title, p=post, pre_post=pre_post, next_post=next_post, hot=hot,
                           tags=tags)
Exemple #42
0
    def _post_snapshots(self, post):
        link_list = ""
        this_post = ""

        logging.debug("Fetching archive link for submission {0}: {1}".format(post.id, "http://redd.it/" + post.id))

        try:
            if post.is_self and post.selftext_html is not None:
                soup = BeautifulSoup(unescape(post.selftext_html))
                for anchor in soup.find_all('a'):
                    url = anchor['href']
                    netloc = urllib.parse.urlparse(url)[1]
                    if netloc == '':
                        netloc = 'reddit.com'
                        url = "http://www.reddit.com" + urllib.parse.urlparse(url)[2]
                    if netloc in self.config['domains'] or 'all' in self.config['domains']:
                        archive_link = self._get_archive_url(self._fix_reddit_url(url))
                        link_list += "* [{0}...]({1})\n\n".format(anchor.contents[0][0:randint(35, 40)], archive_link)

            elif not post.is_self:
                archive_link = self._get_archive_url(self._fix_reddit_url(post.url))
                link_list = "* [Link]({0})\n".format(archive_link)

            this_post = self._get_archive_url("http://redd.it/" + post.id)

        except KeyboardInterrupt as e:
            logging.error("Error fetching archive link on submission {0}: {1}".format(post.id,
                                                                                      "http://redd.it/" + post.id))
            logging.error(str(e))
            pass

        quote = self._get_quote()

        try:
            if not post.archived:
                logging.info("Posting snapshot on submission {0}: {1}".format(post.id,
                                                                              "http://redd.it/" + post.id))
                post.add_comment(self.post_comment.format(quote=quote,
                                                          this_post=this_post,
                                                          links=link_list,
                                                          subreddit=self.config['bot_subreddit']))
            self.post_archive.add(post.id)
        except Exception as e:
            logging.error("Error adding comment on submission {0}: {1}"
                          .format(post.id, "http://redd.it/" + post.id))
            logging.error(str(e))
Exemple #43
0
    def process(self):
        report = self.receive_message()

        raw_report = utils.base64_decode(report.get("raw"))
        raw_report_splitted = raw_report.split("</tr>")[2:]

        parser = html.parser.HTMLParser()

        for row in raw_report_splitted:
            event = Event(report)

            row = row.strip()

            if len(row) <= 0:
                continue

            info = row.split("<td>")
            if len(info) < 3:
                continue

            ip = info[1].split('</td>')[0].strip()
            last_seen = info[2].split('</td>')[0].strip() + '-05:00'
            description = parser.unescape(info[3].split('</td>')[0].strip())

            for key in ClassificationType.allowed_values:
                if description.lower().find(key.lower()) > -1:
                    event.add("classification.type", key)
                    break
            else:
                for key, value in TAXONOMY.items():
                    if description.lower().find(key.lower()) > -1:
                        event.add("classification.type", value)
                        break

            if not event.contains("classification.type"):
                event.add("classification.type", 'unknown')

            event.add("time.source", last_seen)
            event.add("source.ip", ip)
            event.add("event_description.text", description)
            event.add("raw", row)

            self.send_message(event)
        self.acknowledge_message()
Exemple #44
0
  def __init__(self, url, numCols, extractionMap, exceptions):
    # Request the html.
    request = urllib.request.Request(url)
    request.add_header("User-Agent",self.user_agent)
    try:
      response = urllib.request.urlopen(request)
    except:
      print("Error: Invalid URL. Exiting.")
      exit()
    htmlContent = response.read().decode("utf8")

    # Some files have <br> in the middle of a <td> tag,
    # and cause the parser to misinterpret the data.
    htmlContent = htmlContent.replace("<br>", "")

    # Parse the html.
    parser = CountryParser(numCols, extractionMap, exceptions, strict=False)
    htmlContent = parser.unescape(htmlContent) # Unescape HTML entities.
    parser.feed(htmlContent)
    parser.close()
    self.__countryData = parser.countryData
Exemple #45
0
def get_skills(cls):
	r = s.get('http://bddatabase.net/query.php?a=skills&type=%s&l=us' % cls)
	
	data = []
	for entry in r.json()['aaData']:
		id = int(entry[0])
		original_name = bold_pattern.search(entry[2]).group(1)
		level = int(entry[3])

		name = parser.unescape(original_name).strip()
		name = name \
			.replace(': ', '_') \
			.replace(' - ', "_") \
			.replace("'", "") \
			.replace(' ', "_") \
			.upper()

		name = re.sub(r'[^a-zA-Z\d\s_]', '', name)

		skill_number_match = skill_number_pattern.search(name)
		if skill_number_match:
			skill_number = skill_number_match.group()[1:]
			try:
				skill_number = int(skill_number)
			except:
				try:
					skill_number = roman.fromRoman(skill_number)
				except:
					print(repr(original_name))
					raise
			name_without_skill_number = name[:-(skill_number_match.end() - skill_number_match.start())]
		else:
			skill_number = None
			name_without_skill_number = name

		print('%s id %i skillnumber %s' % (name_without_skill_number.encode('cp850', errors='replace'), id, skill_number))
		data.append([name, name_without_skill_number, id, skill_number])

	return data
Exemple #46
0
def parse_playlist(playlist):
    data = json.loads(playlist)

    if not data['status']:
        return []
        
    # trackList would be `null` if no tracks
    track_list = data['data']['trackList']
    if not track_list:
        return []

    parser = html.parser.HTMLParser()

    return [
        {
            key: parser.unescape(track[key])
            for key in [
                'title', 'location', 'lyric', 'pic', 'artist', 'album_name',
                'song_id', 'album_id'
            ]
        }
        for track in track_list
    ]
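A hypothetical input for the function above, showing the structure the code expects (field values are illustrative):

sample = '''{"status": true, "data": {"trackList": [
    {"title": "Song &amp; Title", "location": "", "lyric": "", "pic": "",
     "artist": "Artist", "album_name": "Album", "song_id": "1", "album_id": "2"}]}}'''
tracks = parse_playlist(sample)  # [{'title': 'Song & Title', ...}]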
Exemple #47
0
    def parse_output(self, line):
        # Capture SKOOTs
        if line.find('SKOOT') != -1:
            self.parse_skoot(line)
        else:
            parser = html.parser.HTMLParser()
            line = parser.unescape(line)
            # Before we nuke the HTML closing tags, decide if we need to un-nest some lists.
            if self.list_depth > 0:
                self.list_depth -= line.count('</ul>')
                # pprint('List depth now lowered to: ' + str(self.list_depth))
            line = re.sub(r"</.*?>", "", line)
            tags = []
            self.draw_output("\n")

            # line is now a string with HTML opening tags.
            # Each tag should delineate segment of the string so that if removed the resulting string
            # would be the output line.

            # It can be a subset of (antiquated) HTML tags:
            # center, font, hr, ul, li, pre, b
            pattern = re.compile(r'<(.*?)>')
            segments = pattern.split(line)
            if len(segments) > 1:
                for segment in segments:
                    segment = segment.strip('<>')
                    # Not sure if more Pythonic to do this or a dictionary of functions
                    if re.search(r'thinks aloud:', segment):
                        # Just a thought, print it!
                        self.draw_output('<' + segment + '>', tuple(tags))
                    elif re.match(r'font', segment):
                        # Handle font changes
                        # So far I know of size and color attributes.
                        color = re.match(r'font color="(#[0-9a-fA-F]{6})"', segment)
                        if color:
                            color = color.group(1)
                            self.output_panel.tag_configure(color, foreground=color,
                                                            font=self.output_panel.cget("font"))
                            tags.append(color)
                            # @todo Handle sizes
                    elif re.match(r'hr', segment):
                        i = 0
                        line = ''
                        while i < self.line_length:
                            line += '-'
                            i += 1
                        self.draw_output(line, 'center')
                    elif re.match(r'pre', segment):
                        # For now, we're just handling this as centered because our font is already fixed width.
                        tags.append('center')
                    elif re.match(r'center', segment):
                        tags.append('center')
                    elif re.match(r'b', segment):
                        tags.append('bold')
                    elif re.match(r'ul', segment):
                        self.list_depth += 1
                        # pprint('List depth now raised to: ' + str(self.list_depth))
                        segment.replace('ul', '')
                        if re.match(r'li', segment):
                            segment = segment.replace('li', self.draw_tabs() + "* ")
                            self.draw_output(segment, tuple(tags))
                    elif re.match(r'li', segment):
                        segment = segment.replace('li', self.draw_tabs() + "* ")
                        self.draw_output(segment, tuple(tags))
                    else:
                        # Not a special segment
                        self.draw_output(segment, tuple(tags))
            else:
                self.draw_output(line, None)
Exemple #48
0
        s.auth = HttpNtlmAuth('INTRA\\ruijie.yang', pw, s)
        ret01 = s.get(url06, params=payload01)
        if '<title>Working...' in ret01.text:
            break
        else:
            print('Authentication failed, please try again')
        # ret01 = s.get(url02)
        # payload01['wctx'] = ret01.text.split('wctx=')[1].split('\\u0026')[0]
        # ret01 = s.get(url01)
        # print(ret01.text)
        # if '[200]' in ret01.text:
        #     break
        # else:
        #     print('Authentication failed, please try again')
    # payload02['wctx'] = payload01['wctx']
    payload02['wresult'] = parser.unescape(ret01.text.split('name="wresult" value="')[1].split('" />')[0])
    payload02['wctx'] = parser.unescape(ret01.text.split('name="wctx" value="')[1].split('" />')[0])
    ret02 = s.post(url02, data=payload02)
    # print(ret02.text)
    payload03['t'] = ret02.text.split('value="')[1].split('">')[0]
    url03 = ret02.text.split('action="')[1].split('" ')[0]
    ret03 = s.post(url03, data=payload03)
    payload04['t'] = ret03.text.split('value="')[1].split('">')[0]
    ret04 = s.post(url04, data=payload04)
    ret9 = s.get(url9)
    payload05['t'] = ret9.text.split('value="')[1].split('">')[0]
    
    ret9 = s.post(url05, data=payload05)
    lines = ret9.text.replace('\r\n', '\n').split('\n')
    parseDish(s, 9, lines, dishes, id2pic)
Exemple #49
0
def format_text(text):
    return parser.unescape(text).strip()
Exemple #50
0
def main():
    """main function"""
    if len(sys.argv) < 2:
        print("Usage: auto script")
        exit(1)

    payload01 = {
        "username": "******",
        "wa": "wsignin1.0",
        "wtrealm": "urn:federation:MicrosoftOnline",
        "popupui": "",
    }
    payload02 = {"wa": "wsignin1.0"}
    payload03 = {"wa": "wsignin1.0"}
    payload04 = {}
    payload05 = {}
    pw = getpass.getpass()

    dishes = []
    id2pic = {}

    parser = html.parser.HTMLParser()

    with requests.session() as sess:
        sess.auth = HttpNtlmAuth("INTRA\\ruijie.yang", pw, sess)
        sess.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "en-US,en;q=0.8,ja;q=0.6,zh-CN;q=0.4,zh;q=0.2",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Content-Type": "application/x-www-form-urlencoded",
                "Upgrade-Insecure-Requests": "1",
            }
        )
        # ret01 = sess.get(URL02)
        # payload01['wctx'] = ret01.text.split('wctx=')[1].split('\\u0026')[0]
        ret01 = sess.get(URL01)
        print(ret01.text)
        payload01[
            "wctx"
        ] = "estsredirect=2&estsrequest=rQIIAbPSySgpKSi20tcvyC8qSczRy81MLsovzk8ryc_LycxL1UvOz9XLL0rPTAGxioS4BBYVHnL4-T7Qcc-kppq1-SXbVjEqEzZC_wIj4wtGxltMgv5F6Z4p4cVuqSmpRYklmfl5j5h4Q4tTi_zzcipD8rNT8yYx8-Xkp2fmxRcXpcWn5eSXAwWAJhQkJpfEl2QmZ6eW7GJWSTYySDEwSkvSTTQ2T9M1MTU30LUwMjXXNUlLNrBINkhMTTZJu8AicICTEQA1"
        # payload02['wctx'] = payload01['wctx']
        ret01 = sess.get(URL06, params=payload01)
        payload02["wresult"] = parser.unescape(ret01.text.split('name="wresult" value="')[1].split('" />')[0])
        payload02["wctx"] = parser.unescape(ret01.text.split('name="wctx" value="')[1].split('" />')[0])
        ret02 = sess.post(URL02, data=payload02)
        # print(ret02.text)
        payload03["t"] = ret02.text.split('value="')[1].split('">')[0]
        ret03 = sess.post(URL03, data=payload03)
        payload04["t"] = ret03.text.split('value="')[1].split('">')[0]
        ret04 = sess.post(URL04, data=payload04)
        ret9 = sess.get(URL9)
        payload05["t"] = ret9.text.split('value="')[1].split('">')[0]

        ret9 = sess.post(URL05, data=payload05)
        lines = ret9.text.replace("\r\n", "\n").split("\n")
        parse_dish(sess, 9, lines, dishes, id2pic)

        ret22 = sess.get(URL22)
        lines = ret22.text.replace("\r\n", "\n").split("\n")
        parse_dish(sess, 22, lines, dishes, id2pic)
        download_pics(sess, dishes, id2pic)
        # for x in INGD_LST:
        #     ingdfile = sess.get(URL_INGD % x, stream=True)
        #     with open('static/images/%s.jpg' % x, 'wb') as out_file:
        #         shutil.copyfileobj(ingdfile.raw, out_file)
        #     del ingdfile

    dish_json = json.dumps([vars(x) for x in dishes])
    # print(dish_json)
    # print(id2pic)

    ret = requests.post(URL_UPDATE, data=dish_json)
    print(ret.text)
Exemple #51
0
def unescape(text):
    if (sys.version_info[0] < 3):
        parser = HTMLParser.HTMLParser()
    else:
        parser = html.parser.HTMLParser()
    return (parser.unescape(text))
Exemple #52
0
def about():
    hot_posts = _get_hot()
    tags = _get_tags()
    post = Post.query.filter(Post.category_id == 0).first_or_404()
    post.content = unescape(post.content)
    return render_template('about.html', title='关于作者', hot=hot_posts, tags=tags, post=post)
Exemple #53
0
def unescape(text):
    parser = html.parser.HTMLParser()
    return (parser.unescape(text))
Exemple #54
0
        homoplasy += sum(C.values())
        print(idx+1, pr[1], homoplasy, sum(C.values()))
    print('TOTAL', '{0:.2}'.format(homoplasy / len(data['protos'])))
    
    with open('R_sound-change-frequencies-'+matrix+'.tsv', 'w') as f:
        f.write('SOURCE\tTARGET\tFREQUENCY\n')
        for (s,t),v in sorted(H.items(), key=lambda x: x[1], reverse=True):
            f.write('{0}\t{1}\t{2}\n'.format(s,t,v))
    
    G = nx.DiGraph()
    for a,b in H:
        G.add_edge(a,b,weight=H[a,b])
    nx.write_gml(G,'.tmp.gml')
    tmp = open('.tmp.gml').read()
    with open('R_scf-'+matrix+'.gml', 'w') as f:
        f.write(parser.unescape(tmp))

if 'proto' in argv:
    if not tree:
        raise ValueError("No tree specified!")
    
    C = {}
    for idx,(p,m,c,pr) in enumerate(zip(data['patterns'], data[matrix], 
        data['chars'] if matrix != 'fitch' else data['fitch.chars'],
            data['protos'])):
        

        w,p,r = sankoff_parsimony(
                p,
                data['taxa'],
                tree,
Exemple #55
0
    
    # check statistics, for example, get
    
    C = G.community_infomap(
            edge_weights = 'woccurrence',
            vertex_weights = 'occurrence'
            )
    for community,name in zip(C.membership, G.vs['name']):
        _G.node[name]['infomap'] = community

    print('[i] Calculated communities for rhyme words.')



from html import parser
nx.write_gml(N, 'R_rime_transitions.gml')
with open('R_rime_transitions.gml') as f:
    _t = f.read()
with open('R_rime_transitions.gml','w') as f:
    f.write(parser.unescape(_t))

nx.write_gml(_G, 'R_infomap.gml')
with open('R_infomap.gml') as f:
    _t = f.read()
with open('R_infomap.gml','w') as f:
    f.write(parser.unescape(_t))
nx.write_yaml(_G, 'R_infomap.yaml')



Exemple #56
0
def unescape(message):
    parser = html.parser.HTMLParser()
    return parser.unescape(message)
            self.lyric_data = self.getpos()
            self.start_lyric_set = False
            self.lyric_data_set = True

    def handle_endtag(self, tag):
        if self.lyric_data_set:
            self.lyric_data_set = False
            self.end_lyric = self.getpos()


if __name__ == "__main__":
    page = urllib.request.urlopen("http://lyrics.wikia.com/Radiohead:Separator")
    parser = LyricParser()
    giant_html_string = ""
    for line in page:
        giant_html_string += parser.unescape((fix(line)))

    parser.feed(giant_html_string)

    print(parser.start_lyric)
    print(parser.lyric_data)
    print(parser.end_lyric)

    # print(giant_html_string[parser.start_lyric[0]:parser.lyric_data[0]])
    i = 0
    for line in giant_html_string.split("\n"):
        if i >= parser.start_lyric[0] and i <= parser.lyric_data[0]:
            print(line)
        i += 1
    print(i)
    # page.seek(parser.start_lyric[0])