Ejemplo n.º 1
0
def get_room_info(url):
    """Fetch connection details for the room whose page lives at ``url``.

    The room page is downloaded with ``get_content``, the numeric room id is
    pulled out of it with a regular expression, and the room API endpoint is
    then queried for its JSON metadata.  The first entry of the ``servers``
    list in that metadata is used as the destination server.

    Args:
        url: Address of the room page to inspect.

    Returns:
        A dict with the keys ``'s_ip'`` and ``'s_port'`` (taken from the
        first listed server) and ``'rid'`` (the ``room_id`` field of the
        metadata, passed through ``.encode()``).
    """
    print('==========room')
    html = get_content(url)
    room_id_patt = r'"room_id":(\d{1,99}),'
    title_patt = r'<div class="headline clearfix">\s*<h1>([^<]{1,9999})</h1>'
    title_patt_backup = r'<title>([^<]{1,9999})</title>'

    roomid = match1(html, room_id_patt)
    # NOTE(review): the title is extracted and unescaped but is not part of
    # the returned dict; kept so that the calls made here stay unchanged.
    title = match1(html, title_patt) or match1(html, title_patt_backup)
    title = unescape_html(title)

    conf = get_content("http://www.douyutv.com/api/client/room/" + roomid)
    metadata = json.loads(conf)
    print(metadata)
    servers = metadata['data']['servers']

    print(servers)
    dest_server = servers[0]
    # The original had a print() after this return; it could never execute
    # and has been dropped.
    return {
        's_ip': dest_server['ip'],
        's_port': dest_server['port'],
        'rid': metadata['data']['room_id'].encode()
    }
    def test_escape_html(self):
        """Test escape HTML on selected codepoints.

        Checks three things:
          * ``escaping.escape_html`` turns the shared sample input into the
            expected escaped form,
          * ``html.unescape_html`` reverses that output, except that the
            first character comes back as U+FFFD,
          * non-string inputs (``42`` and ``None``) are rendered as ``'42'``
            and ``''`` respectively.
        """
        test_input = test_common.ASCII_AND_SELECTED_CODEPOINTS

        # The backslash is written as '\\' so the literal does not rely on
        # an unrecognized escape sequence; the resulting string is the same.
        want = (
            u'&#xfffd;\x01\x02\x03\x04\x05\x06\x07'
            u'\x08\t\n\x0B\x0C\r\x0E\x0F'
            u'\x10\x11\x12\x13\x14\x15\x16\x17'
            u'\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'
            u' !&#34;#$%&amp;&#39;()*&#43;,-./'
            u'0123456789:;&lt;=&gt;?'
            u'@ABCDEFGHIJKLMNO'
            u'PQRSTUVWXYZ[\\]^_'
            u'&#96;abcdefghijklmno'
            u'pqrstuvwxyz{|}~\x7f'
            u'\u00A0\u0100\u2028\u2029\ufdec\ufeff\U0001D11E')

        got = escaping.escape_html(test_input)
        # assertEqual is used instead of the deprecated assertEquals alias.
        self.assertEqual(
            want, got, 'escaped:\n\t%r\n!=\n\t%r' % (want, got))

        # Round trip: unescaping must give back the input, with its first
        # character replaced by U+FFFD.
        want, got = u'\ufffd%s' % test_input[1:], html.unescape_html(got)
        self.assertEqual(
            want, got, 'reversible:\n\t%r\n!=\n\t%r' % (want, got))

        self.assertEqual('42', escaping.escape_html(42))
        self.assertEqual('', escaping.escape_html(None))
Ejemplo n.º 3
0
 def rel_links(cls, page):
   """Yield the unescaped URLs of the rel= links in ``page`` to scrape.

   A ``REL_RE`` match is considered only when its first group is one of
   ``REL_TYPES`` and ``HREF_RE`` finds an href inside the matched text.
   Links whose URL path ends with any of ``REL_SKIP_EXTENSIONS`` are
   skipped (obviously data links).
   """
   for rel_match in cls.REL_RE.finditer(page):
     matched_text, rel_type = rel_match.group(0), rel_match.group(1)
     if rel_type not in cls.REL_TYPES:
       continue
     found_href = cls.HREF_RE.search(matched_text)
     if not found_href:
       continue
     url = cls.href_match_to_url(found_href)
     url_path = urlparse(url).path
     if any(url_path.endswith(suffix) for suffix in cls.REL_SKIP_EXTENSIONS):
       continue
     yield unescape_html(url)
Ejemplo n.º 4
0
def get_room_info(url):
    """Look up the streaming server and room id for the room page at ``url``.

    Downloads the page with ``get_content``, extracts the room id from it,
    then loads the JSON returned by the room API for that id and picks the
    first entry of its ``servers`` list.

    Args:
        url: Address of the room page.

    Returns:
        A dict with ``'s_ip'`` and ``'s_port'`` from the first listed server
        and ``'rid'``, the metadata's ``room_id`` passed through ``.encode()``.
    """
    print('==========room')
    html = get_content(url)
    room_id_patt = r'"room_id":(\d{1,99}),'
    title_patt = r'<div class="headline clearfix">\s*<h1>([^<]{1,9999})</h1>'
    title_patt_backup = r'<title>([^<]{1,9999})</title>'

    roomid = match1(html, room_id_patt)
    # NOTE(review): the title is computed but not returned; the calls are
    # kept as they were.
    title = match1(html, title_patt) or match1(html, title_patt_backup)
    title = unescape_html(title)

    conf = get_content("http://www.douyutv.com/api/client/room/" + roomid)
    metadata = json.loads(conf)
    servers = metadata['data']['servers']
    dest_server = servers[0]
    # A print(metadata) used to follow this return; it was unreachable and
    # has been removed.
    return {
        's_ip': dest_server['ip'],
        's_port': dest_server['port'],
        'rid': metadata['data']['room_id'].encode(),
    }
Ejemplo n.º 5
0
def get_filename(htmlstring):
  """Return ``htmlstring`` passed through ``unescape_html`` and then ``legitimize``."""
  unescaped = unescape_html(htmlstring)
  return legitimize(unescaped)
Ejemplo n.º 6
0
 def links(cls, page):
   """Yield the unescaped URL of every ``HREF_RE`` match found in ``page``.

   This covers all links on the page, potentially including rel= links.
   """
   for href_match in cls.HREF_RE.finditer(page):
     yield unescape_html(cls.href_match_to_url(href_match))
Ejemplo n.º 7
0
def strip_html(html):
	"""Remove every ``TAG`` match from ``html`` and unescape what is left."""
	without_tags = TAG.sub('', html)
	return unescape_html(without_tags)
Ejemplo n.º 8
0
def get_filename(htmlstring):
    """Return ``htmlstring`` passed through ``unescape_html`` and then ``legitimize``."""
    decoded = unescape_html(htmlstring)
    return legitimize(decoded)
def process_raw_text(raw_text, context):
    """
    Consume a chunk of raw template text, tracking context and normalizing.

    The text is consumed piece by piece.  Outside an attribute value each
    piece is handed straight to _process_next_token.  Inside an attribute
    value the text up to the end of the value is first HTML-decoded, then
    tokenized, and each replacement is re-escaped for the quote style that
    is written to the output.  Unquoted attribute values are emitted wrapped
    in double quotes.

    Args:
      raw_text - A chunk of HTML/CSS/JS.
      context - The context before raw_text.

    Returns (
      the context after raw_text which may be an error context,
      a normalized version of the text or None if an error occurred,
      None or the context immediately prior to the error,
      None or the unprocessed suffix of raw_text when the error occurred)

    May raise ContextUpdateFailure which is equivalent to returning
    STATE_ERROR but with a more informative error message.
    """

    # Accumulates the normalized output text.
    normalized = StringIO.StringIO()

    while raw_text:
        # Snapshot of the state before this iteration; returned to the
        # caller if the iteration ends in an error context.
        prior_context, prior_raw_text = context, raw_text

        delim_type = delim_type_of(context)

        # If we are in an attribute value, then decode raw_text (except
        # for the delimiter) up to the next occurrence of delimiter.

        # The end of the section to decode.  Either before a delimiter
        # or > symbol that closes an attribute, at the end of the raw_text,
        # or -1 if no decoding needs to happen.

        attr_value_end = _end_of_attr_value(raw_text, delim_type)
        if attr_value_end == -1:
            # Outside an attribute value.  No need to decode.
            num_consumed, context, replacement_text = _process_next_token(
                raw_text, context)
            raw_text = raw_text[num_consumed:]
            normalized.write(replacement_text)

            if delim_type_of(context) == DELIM_SPACE_OR_TAG_END:
                # Introduce a double quote when we transition into an unquoted
                # attribute body.
                normalized.write('"')
        else:
            # Inside an attribute value.  Find the end and decode up to it.

            if delim_type == DELIM_SPACE_OR_TAG_END:
                # Check for suspicious characters in the value.
                # http://www.w3.org/TR/html5/tokenization.html
                # #attribute-value-unquoted-state
                # identifies [\0"'<=`] as transitions to error states.
                # If they occur in an unquoted value they are almost surely
                # an indication of an error in the template.
                bad = re.search(r'[\x00"\'<=`]', raw_text[:attr_value_end])
                if bad:
                    raise ContextUpdateFailure(
                        '%r in unquoted attr: %r'
                        % (bad.group(), raw_text[:attr_value_end]))

            # All of the languages we deal with (HTML, CSS, and JS) use
            # quotes as delimiters.
            # When one language is embedded in the other, we need to
            # decode delimiters before trying to parse the content in the
            # embedded language.

            # For example, in
            #       <a onclick="alert(&quot;Hello {$world}&quot;)">
            # the decoded value of the event handler is
            #       alert("Hello {$world}")
            # so to determine the appropriate escaping convention we decode
            # the attribute value before delegating to _process_next_token.

            # We could take the cross-product of two languages to avoid
            # decoding but that leads to either an explosion in the
            # number of states, or the amount of lookahead required.

            # The end of the attribute value.  At attr_value_end, or
            # attr_value_end + 1 if a delimiter needs to be consumed.
            # -1 means the value runs to the end of raw_text without being
            # terminated.
            if attr_value_end < len(raw_text):
                attr_end = attr_value_end + len(DELIM_TEXT[delim_type])
            else:
                attr_end = -1

            # Decode so that the JavaScript rules work on attribute values
            # like
            #     <a onclick='alert(&quot;{$msg}!&quot;)'>

            # If we've already processed the tokens "<a", " onclick='" to
            # get into the single quoted JS attribute context, then we do
            # three things:
            #   (1) This class will decode "&quot;" to "\"" and work below
            #       to go from STATE_JS to STATE_JSDQ_STR.
            #   (2) Then the caller checks {$msg} and realizes that $msg is
            #       part of a JS string.
            #   (3) Then, the above will identify the "'" as the end, and
            #       so we reach here with:
            #       r a w T e x t = " ! & q u o t ; ) ' > "
            #                                         ^ ^
            #                            attr_value_end attr_end

            # We use this example more in the comments below.

            attr_value_tail = html.unescape_html(raw_text[:attr_value_end])
            # attr_value_tail is "!\")" in the example above.

            # Pick the escaper matching the quote character that is written
            # around the value in the normalized output: single quotes stay
            # single, everything else (including unquoted) uses double quotes.
            if delim_type == DELIM_SINGLE_QUOTE:
                escaper = escaping.escape_html_sq_only
            else:
                escaper = escaping.escape_html_dq_only

            # Recurse on the decoded value.
            while attr_value_tail:
                num_consumed, context, replacement = _process_next_token(
                    attr_value_tail, context)
                attr_value_tail = attr_value_tail[num_consumed:]
                normalized.write(escaper(replacement))

            # TODO: Maybe check that context is legal to end an attr in.
            # Throw if the attribute ends inside a quoted string.

            if attr_end != -1:
                raw_text = raw_text[attr_end:]
                # raw_text is now ">" from the example above.

                # When an attribute ends, we're back in the tag.
                context = STATE_TAG | element_type_of(context)

                # Append the delimiter on exiting an attribute.
                if delim_type == DELIM_SINGLE_QUOTE:
                    normalized.write("'")
                else:
                    # Inserts an end quote for unquoted attributes.
                    normalized.write('"')
            else:
                # Whole tail is part of an unterminated attribute.
                if attr_value_end != len(raw_text):  # pragma: no cover
                    raise AssertionError()  # Illegal state.
                raw_text = ""
        # Stop at the first error context and report where it happened.
        if is_error_context(context):
            return context, None, prior_context, prior_raw_text
    return context, normalized.getvalue(), None, None
 def test_unescape_html(self):
     """
     Test unescape_html on corner cases like supplemental codepoints,
     re-escaping, broken escapes, etc.

     Covers: empty and entity-free input, named entities with and without
     the trailing semicolon, double-escaped text (only one level is
     undone), unknown entity names, case variants of entity names, decimal
     and hexadecimal numeric references, a surrogate pair written as two
     numeric references, and malformed numeric references.
     """
     # assertEqual is used throughout instead of the deprecated
     # assertEquals alias.
     self.assertEqual('', html.unescape_html(''))
     self.assertEqual('foo', html.unescape_html('foo'))
     self.assertEqual('foo<bar', html.unescape_html('foo&lt;bar'))
     self.assertEqual('foo< bar', html.unescape_html('foo&lt bar'))
     self.assertEqual('foo&amp;bar', html.unescape_html('foo&amp;amp;bar'))
     self.assertEqual('foo&bogus;bar', html.unescape_html('foo&bogus;bar'))
     self.assertEqual(
         u'>>>\u226b&gt;', html.unescape_html('&gt&gt;&GT;&Gt;&amp;gt;'))
     self.assertEqual(
         '""""', html.unescape_html('&#34;&#x22;&#X22;&quot;'))
     self.assertEqual(
         '<<<<', html.unescape_html('&#60;&#x3c;&#X3C;&lt;'))
     self.assertEqual(
         u'\u1234\u1234', html.unescape_html('&#4660;&#x1234;'))
     self.assertEqual(
         u'\uabcd\uabcd', html.unescape_html('&#43981;&#xabcd;'))
     self.assertEqual(
         u"\U0001D11E\U0001D11E",
         html.unescape_html('&#x1d11e;&#xd834;&#xdd1e;'))
     # Broken numeric escapes are expected to pass through unchanged.  The
     # original compared the literal with itself, so unescape_html was
     # never exercised for this case.
     self.assertEqual(
         "&#;&#gt;&#xxa0;", html.unescape_html("&#;&#gt;&#xxa0;"))
Ejemplo n.º 11
0
def get_filename(htmlstring):
    """Return ``htmlstring`` passed through ``unescape_html`` and then ``filter_filename``."""
    plain_text = unescape_html(htmlstring)
    return filter_filename(plain_text)
Ejemplo n.º 12
0
            file_contents = etree.tostring(tree,
                                           pretty_print=True).decode("utf8")
        except Exception as e:
            traceback.print_exc()
            print(
                "The clean-up script did not manage to fix all your XML issues",
                file=sys.stderr)
            print(
                'Try running "xmllint -noout" on the output file to get a more complete error report',
                file=sys.stderr)
            failed_files.append(filename)

        filename = os.path.join(output_dir, os.path.basename(filename))

        # Convert HTML entities:
        file_contents = unescape_html(file_contents)
        with open(filename, "w") as outfile:
            lines = file_contents.split("\n")
            outfile.write("\n".join(lines[1:-1]))

    try:
        print(total, file=sys.stderr)
    except UnicodeEncodeError:
        print(str(total).encode("utf-8"))

    if invalid_entities:
        print("##### Undeclared entities present in files.######",
              file=sys.stderr)
        print("To resolve, add definitions in this script", file=sys.stderr)
        for k, v in invalid_entities.items():
            print("\t\t".join([k, str(v)]))