Beispiel #1
0
# Filename pattern for saved pixiv works, filled in via str.format;
# the {{{tools}}} part renders as literal braces around the tool list.
fname_pixiv_parttern = "{user} - {title} ({work_id}@{user_id})[{tags}]{{{tools}}}"

# NOTE(review): presumably "save into a Dropbox-synced folder" -- confirm.
dropboxed = True
# Feature toggles: write tag files / rename files after download.
opt_tagFile = opt_renameFile = False


import os, sys, re, time, io
import urllib, json
from optparse import OptionParser
from PIL import Image
import lxml.html as lhtml
from lxml.etree import HTMLParser


# Shared lxml HTML parser that decodes page input as UTF-8.
utf8Parser = HTMLParser(encoding="utf-8")

# Default filename filter: matches names whose extension contains a 'p'
# followed by a 'g' after the last dot-ish position (e.g. .png, .jpg, .jpeg).
# Raw string fixes the invalid '\.' escape (W605); the pattern is unchanged.
defaultFilter = re.compile(r'.*\..*p.*g')
# Path separator used when composing file paths below.
dir_slash = '/'

def clearifyText(s):
	"""Strip newlines and collapse runs of spaces and tabs in *s*.

	Leading/trailing separators are dropped as a side effect of the
	split/join round-trip.
	"""
	s = s.replace('\n', '')
	for sep in (' ', '\t'):
		s = sep.join(filter(None, s.split(sep)))
	return s

def getFileList(path, filter=defaultFilter):
	r = []
	for file in os.listdir(path):
		if os.path.isdir(file):
			r += getFileList(path, filter)
Beispiel #2
0
    def parse(self, parser=None, base_url=None):
        """Parses the underlying html source using `lxml` library.

        This parsed tree is stored in :attr:`root` of this object.
        which could be used to perform numerous operations.

        :param parser: optional pre-built ``HTMLParser``; a default one is
            created (with this object's encoding) when omitted.
        :param base_url: optional base url forwarded to :func:`lxml_parse`.

        Returns
        -------
            ElementTree
        """
        utx = self._get_utx()

        assert utx is not None, "UrlTransformer not Implemented."  # internal error
        assert utx.base_path is not None, "Base Path is not set!"
        assert utx.base_url is not None, "Base url is not Set!"

        # BUG FIX: the original constructed a TypeError without raising it,
        # and ran the type check before the None-default fallback, so a
        # legitimate ``parser=None`` call would have failed the check.
        if parser is None:
            parser = HTMLParser(encoding=self.encoding, collect_ids=False)
        elif not isinstance(parser, HTMLParser):
            raise TypeError("Expected instance of <%r>, got <%r>" %
                            (HTMLParser, parser))

        source = self.get_source()

        assert source is not None, "Source is not Set!"
        assert hasattr(source, 'read'), "File like object is required!"
        LOGGER.info(
            'Parsing tree with source: <%r> encoding <%s> and parser <%r>' %
            (self._source, self.encoding, parser))

        context_tree = lxml_parse(source, parser=parser, base_url=base_url)
        # The tree generated by the parse is stored in the self.root
        # variable and can be utilised further for any number of use cases
        self._tree = context_tree
        self.root = context_tree.getroot()

        if self.root is not None:
            # WaterMarking :)
            self.root.insert(
                0, Comment(MARK.format('', __version__, utx.url, utc_now(),
                                       '')))

        # There are internal links present on the html page which are files
        # that includes `#` and `javascript:` and 'data:base64;` type links
        # or a simple `/` url referring anchor tag
        # thus these links needs to be left as is.
        factory = getattr(self, 'make_element', None)
        assert callable(factory), "Element generator is not callable!"

        # Modify the tree elements
        for el in context_tree.iter():
            # A element can contain multiple urls
            for pack in self._handle_lxml_elem(el):

                if pack is not None:
                    elem, attr, url, pos = pack
                else:  # pragma: no cover
                    continue

                if elem is not None:
                    o = factory(elem, attr, url, pos)
                    if o is not None:
                        self._stack.append(o)

        self._parseComplete = True
        return self.root
Beispiel #3
0
substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

'''
import json
from blox.base import Blox, Text, UnsafeText
from blox.all import factory
from xml.dom import minidom
from lxml.etree import HTMLParser, parse, fromstring

# Default lxml HTML parser used to read template sources.
parser = HTMLParser()
# Python-source skeleton emitted for compiled templates; {indent},
# {accessors} and {build_steps} are substituted via str.format at build time.
SCRIPT_TEMPLATE = """# WARNING: DON'T EDIT AUTO-GENERATED

from blox.base import Blox, Text, UnsafeText


class Template(Blox):
{indent}__slots__ = tuple({accessors})


def build(factory):
{indent}template = Template()
{indent}{build_steps}
{indent}return template
"""
Beispiel #4
0
def extract_bleach(text):
    """Extract tags and attributes from *text* in a form suitable for bleach."""
    collector = MarkupExtractor()
    html_parser = HTMLParser(collect_ids=False, target=collector)
    html_parser.feed(text)
    return {
        "tags": collector.found_tags,
        "attributes": collector.found_attributes,
    }
Beispiel #5
0
def html2plaintext(html, body_id=None, encoding='utf-8'):
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """

    html = ustr(html)

    from lxml.etree import Element, tostring
    try:
        # Prefer the lenient BeautifulSoup-backed parser when available.
        from lxml.html.soupparser import fromstring
        kwargs = {}
    except ImportError:
        _logger.debug('tools.misc.html2plaintext: cannot use BeautifulSoup, fallback to lxml.etree.HTMLParser')
        from lxml.etree import fromstring, HTMLParser
        kwargs = dict(parser=HTMLParser())

    tree = fromstring(html, **kwargs)

    if body_id is not None:
        # NOTE(review): body_id is interpolated verbatim, so callers must pass
        # an already-quoted value (e.g. "'main'") for a valid XPath -- confirm.
        source = tree.xpath('//*[@id=%s]'%(body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    # Rewrite every <a> as "<text> [n]" and remember its target so the urls
    # can be appended as numbered footnotes at the end.
    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(tostring(tree, encoding=encoding))

    # Poor-man's markdown: map emphasis/heading tags onto ASCII markers.
    html = html.replace('<strong>','*').replace('</strong>','*')
    html = html.replace('<b>','*').replace('</b>','*')
    html = html.replace('<h3>','*').replace('</h3>','*')
    html = html.replace('<h2>','**').replace('</h2>','**')
    html = html.replace('<h1>','**').replace('</h1>','**')
    html = html.replace('<em>','/').replace('</em>','/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    # Raw strings fix the invalid '\s' escape warning; patterns are unchanged.
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub(r'<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    # Append the collected link targets as footnotes.
    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i+1, url)

    return html
Beispiel #6
0
#!/usr/bin/env python
# coding: utf-8

import codecs
import sys
from locale import getpreferredencoding
from functools import partial
import logging

from lxml.etree import HTMLParser
from lxml.html import parse


logging.basicConfig(format='%(message)s', level=logging.INFO)

# Bind the cp1251 decoder into lxml's parse() once so callers can just parse(url).
# NOTE: this deliberately rebinds (shadows) the imported lxml.html.parse.
parse = partial(parse, parser=HTMLParser(encoding="cp1251"))
encoding = getpreferredencoding()
urls = set()
out = sys.stdout

columns = [
    u'Регион',
    u'УИК',
    u'Число избирателей, внесенных в список избирателей',
    u'Число избирательных бюллетеней, полученных участковой избирательной комиссией',
    u'Число избирательных бюллетеней, выданных избирателям, проголосовавшим досрочно',
    u'Число избирательных бюллетеней, выданных избирателям в помещении для голосования',
    u'Число избирательных бюллетеней, выданных избирателям вне помещения для голосования ',
    u'Число погашенных избирательных бюллетеней',
    u'Число избирательных бюллетеней в переносных ящиках для голосования',
    u'Число избирательных бюллетеней в стационарных ящиках для голосования',
Beispiel #7
0
 def create_html_parser_unicode(self):
     """Build and return an lxml HTMLParser that decodes input as UTF-8."""
     return HTMLParser(encoding="utf-8")
    def create(cls, username, password, email, gender, **kwargs):
        """Register a new drtuber.com account and return a ``cls`` instance.

        Fetches the signup form, solves the captcha via the configured
        solver, then submits the registration form.

        :param gender: 'm'/'f' (any case) or a full label; normalised below.
        :raises CannotFindVar: when the hidden formId input is missing.
        :raises AccountProblem: when the site reports signup errors.
        """
        def get_captcha_image():
            # Download the captcha image and wrap it in a file-like buffer.
            download_captcha = session.get('http://www.drtuber.com/captcha',
                                           proxies=proxy)
            captcha_data = io.BytesIO(download_captcha.content)
            return captcha_data

        # Normalise single-letter gender codes to the labels the form expects.
        if gender.lower() == 'm':
            gender = 'Male'
        if gender.lower() == 'f':
            gender = 'Female'

        http_settings = kwargs.get('http_settings', HttpSettings())

        session = kwargs.get('session', http_settings.session)
        proxy = kwargs.get('proxy', http_settings.proxy)
        captcha_solver = kwargs.get('captcha_solver', DEFAULT_CAPTCHA_SOLVER)
        maximum_waiting_time = kwargs.get('maximum_waiting_time',
                                          DEFAULT_CAPTCHA_MAXIMUM_WAITING)

        url = 'http://www.drtuber.com/ajax/popup_forms?form=signup'
        sign_up_form = session.get(url, proxies=proxy)

        # The signup markup is delivered inside a JSON envelope ('answer').
        doc = etree.fromstring(sign_up_form.json()['answer'], HTMLParser())

        found_form_id = doc.xpath('//input[@name="formId"]/@value')
        if not found_form_id:
            raise CannotFindVar(
                'Cannot find formId , required for creating an account')

        form_id = found_form_id[0]

        captcha_image = get_captcha_image()
        captcha_response = cls.submit_captcha_and_wait(
            captcha_image,
            maximum_waiting_time=maximum_waiting_time,
            captcha_solver=captcha_solver)

        url = 'http://www.drtuber.com/signup/do?ajax=true&json=true'

        post = {
            'username': username,
            'password': password,
            'password_confirm': password,
            'gender': gender,
            'email': email,
            'verification': captcha_response,
            'terms': 'on',
            'age': 'on',
            'formId': form_id,
            'type': 'free',
            'redirectUrl': '/',
            'from': ''
        }

        create_account = session.post(url, data=post, proxies=proxy)
        response = create_account.json()

        if response['errors']:
            raise AccountProblem('Cannot create drtuber account due to errors:' \
                                 '{e}'.format(e=" AND ".join(response['errors'])))

        remember_me = kwargs.get('remember_me', False)

        return cls(username=username,
                   password=password,
                   email=email,
                   remember_me=remember_me,
                   gender=gender)
Beispiel #9
0
    def start(self):
        """Run the pornhub video-upload flow (Python 2 code, note the
        ``print`` statement below).

        Validates the request/account, scrapes hidden form variables off the
        upload page, streams the video as multipart data, and fires the
        'started'/'uploading'/'failed'/'finished' hooks along the way.
        Returns ``{'status': True}`` on success.
        """

        try:
            if not isinstance(self.video_upload_request,
                              PornhubVideoUploadRequest):
                raise InvalidVideoUploadRequest(
                    'Invalid video_upload_request, '
                    'it needs to be a PornhubVideoUploadRequest instance')

            if not isinstance(self.account, PornhubAccount):
                raise InvalidAccount(
                    'Invalid account, it needs to be a PornhubAccount instance'
                )

            if not self.account.is_logined():
                raise NotLogined('Pornhub account is not logined')

            self.call_hook('started',
                           video_upload_request=self.video_upload_request,
                           account=self.account)

            session = self.account.http_settings.session
            proxy = self.account.http_settings.proxy

            go_to_upload = session.get('http://www.pornhub.com/upload/video',
                                       proxies=proxy)
            session.headers.update({"X-Requested-With": "XMLHttpRequest"})

            # Cut the saveBlockContent <div> out of the page and re-wrap it so
            # lxml gets a single well-formed fragment to parse.
            container = go_to_upload.content.\
                split('<div class="saveBlockContent">')[1].\
                split('</div><!-- /.saveBlockContent -->')[0]
            container = "".join(
                ['<div class="saveBlockContent">', container, '</div>'])
            doc = etree.fromstring(container, HTMLParser())

            get_cookie = doc.xpath('//input[@name="cookie[]"]/@value')
            get_platform_id = doc.xpath('//input[@name="platformId[]"]/@value')
            get_source = doc.xpath('//input[@name="source[]"]/@value')

            if len(get_cookie) == 0:
                raise CannotFindVar(
                    'Cannot find input field with name:cookies[]')
            if len(get_platform_id) == 0:
                raise CannotFindVar(
                    'Cannot find input field with name:platformId[]')
            if len(get_source) == 0:
                raise CannotFindVar(
                    'Cannot find input field with name:source[]')

            video_file = self.video_upload_request.video_file
            title = self.video_upload_request.title.name
            tags = self.video_upload_request.tags
            category = self.video_upload_request.category
            is_private = self.video_upload_request.is_private
            is_homemade = self.video_upload_request.is_homemade
            porn_stars = self.video_upload_request.porn_stars

            # Categories are posted as a JSON-ish array of quoted id strings.
            if not isinstance(category, (tuple, list)):
                categories = '["{cat}"]'.format(cat=str(category.category_id))
            else:
                categories = [
                    '"{c}"'.format(c=str(cat.category_id)) for cat in category
                ]
                categories = '[{cats}]'.format(cats=','.join(categories))

            if isinstance(tags, (tuple, list)):
                tags = " ".join([t.name for t in tags])

            privacy = "private" if is_private else "community"
            production = "homemade" if is_homemade else "professional"
            fields = []

            fields.append(('title', title))
            fields.append(('callbackUrl', ''))
            fields.append(('platformId', str(get_platform_id[0])))
            fields.append(('categories', categories))
            fields.append(('tags', tags))
            fields.append(('privacy', privacy))
            fields.append(('source', str(get_source[0])))
            fields.append(('pornstars', porn_stars))
            fields.append(('cookie', get_cookie[0]))
            fields.append(('production', production))
            fields.append(('timestamp', str(generate_timestamp())))
            fields.append(('isPremiumVideo', "0"))
            fields.append(('Filedata', (path.Path(video_file).name,
                                        open(video_file, 'rb'))))
            encoder = type(self).create_multipart_encoder(fields)

            self.upload_monitor = type(self).create_multipart_monitor(
                encoder=encoder, callback=self._hooks['uploading'])

            self.call_hook('uploading',
                           video_upload_request=self.video_upload_request,
                           account=self.account)

            # The upload endpoint is embedded in the page's javascript.
            get_upload_url = re.search(r"var\s+url\s+=\s+'(.*?)',",
                                       go_to_upload.content)
            if not get_upload_url:
                raise CannotFindVar(
                    'Cannot find uploading url for pornhub.com')

            url = get_upload_url.group(1)
            upload_video = session.post(
                url,
                data=self.upload_monitor,
                proxies=proxy,
                headers={'Content-Type': self.upload_monitor.content_type})
            # Success is detected by a marker string in the raw response body.
            find_str = '{"@type":["GorillaHub\\\SDKs\\\SDKBundle\\\V0001\\\Domain\\\Responses\\\FileUploadedResponse",[]]'
            if find_str not in upload_video.content:
                raise FailedUpload(
                    'Upload possibly failed, did not find success string:{string}'
                    .format(string=find_str))
        except Exception as exc:
            # NOTE(review): if one of the validation checks above raised,
            # `session` was never bound, so this `del` raises NameError -- confirm.
            del session.headers["X-Requested-With"]
            self.call_hook('failed',
                           video_upload_request=self.video_upload_request,
                           account=self.account,
                           traceback=traceback.format_exc(),
                           exc_info=sys.exc_info())

            print traceback.format_exc()
            if self.bubble_up_exception:
                raise exc

        else:
            del session.headers["X-Requested-With"]
            self.call_hook('finished',
                           video_request=self.video_upload_request,
                           account=self.account,
                           settings={''})

            return {'status': True}
Beispiel #10
0
import re
from pprint import pprint

from flask import current_app, Flask, render_template, request, session, jsonify, abort, Blueprint
from app.models import Answers, Questions, db
from app.functions import *
from . import search

from lxml import html
from lxml.etree import HTMLParser
from app.functions import AlchemyEncoder
from stackapi import StackAPI
from urllib.parse import unquote
from app.celery_tasks import insertQuestion, insertAnswer

# lxml parser that drops ignorable whitespace between tags.
whitespace_parser = HTMLParser(remove_blank_text=True)
# Shared StackAPI client limited to a single 20-item page per fetch.
stackOverflowConnection = StackAPI('stackoverflow')
stackOverflowConnection.page_size = 20
stackOverflowConnection.max_pages = 1

bp = Blueprint('main', __name__, url_prefix='/', static_folder='static')


def stack_get_answers(question_id):
    """
    Api -> https://api.stackexchange.com/docs/types/answer
    """
    returnedAnswers = stackOverflowConnection.fetch('questions/{ids}/answers',
                                                    ids=[int(question_id)],
                                                    sort='votes',
                                                    filter="withbody")
Beispiel #11
0
    def create(cls, username, password, email, **kwargs):
        """Register a new pornhub.com account.

        Scrapes the hidden signup key/hash/id off the create-account page,
        submits the registration form, and returns True when the confirmation
        element is found.

        :raises CannotFindVar: when a hidden signup field is missing.
        :raises AccountProblem: when the site reports errors or no
            confirmation element appears.
        """

        from lxml import etree
        from lxml.etree import HTMLParser

        from bringyourownproxies.httpclient import HttpSettings
        from bringyourownproxies.errors import CannotFindVar

        remember_me = kwargs.get('remember_me', False)
        http_settings = kwargs.get('http_settings', HttpSettings())
        session = http_settings.session
        proxy = http_settings.proxy

        # Warm-up GET to collect cookies before hitting the signup page.
        session.get('http://www.pornhub.com', proxies=proxy)

        create_page = session.get('http://www.pornhub.com/create_account',
                                  proxies=proxy)

        doc = etree.fromstring(create_page.content, HTMLParser())

        found_signup_key = doc.xpath('//input[@name="signup_key"]/@value')
        found_signup_hash = doc.xpath('//input[@name="signup_hash"]/@value')
        found_signup_id = doc.xpath('//input[@name="signup_id"]/@value')

        if not found_signup_key:
            raise CannotFindVar('Cannot find signup_key in pornhub.com')
        if not found_signup_hash:
            raise CannotFindVar('Cannot find signup_hash in pornhub.com')
        if not found_signup_id:
            raise CannotFindVar('Cannot find signup_id in pornhub.com')

        signup_key = found_signup_key[0]
        signup_hash = found_signup_hash[0]
        signup_id = found_signup_id[0]

        post = {
            'signup_key': signup_key,
            'signup_hash': signup_hash,
            'signup_id': signup_id,
            'check_what': 'username',
            'email': email,
            'username': username,
            'password': password,
            'agreed': '1'
        }
        session.headers.update({'X-Requested-With': 'XMLHttpRequest'})
        # NOTE(review): this availability-check POST sends no payload --
        # `data=post` looks intended here; confirm against the site's API.
        session.post('http://www.pornhub.com/user/create_account_check',
                     proxies=proxy)

        create_account = session.post('http://www.pornhub.com/create_account',
                                      data=post,
                                      proxies=proxy)

        errors = []
        doc = etree.fromstring(create_account.content, HTMLParser())
        found_errors = doc.xpath('//div[@class="error"]/div')

        if found_errors:
            # Each error div's text carries two junk leading chars and one
            # trailing char, hence the slice.
            errors = [
                error.text[2:len(error.text) - 1] for error in found_errors
            ]
            raise AccountProblem(
                'Failed creating account at pornhub due to errors:{e}'.format(
                    e=' AND '.join(errors)))
        found_confirmation = doc.xpath(
            '//div[@class="sprite-signup-confirmation absolute"]')

        if found_confirmation:
            return True
        else:
            raise AccountProblem(
                'Failed creating account at pornhub for unknown problem')
Beispiel #12
0
# Repository coordinates for crosvm inside the Chromium OS manifests.
git_path = 'chromiumos/platform/crosvm'
git_root = 'https://chromium.googlesource.com/'
manifest_versions = f'{git_root}chromiumos/manifest-versions'
buildspecs_url = f'{manifest_versions}/+/refs/heads/master/full/buildspecs/'

# CrOS version numbers look like this:
# [<chrome-major-version>.]<tip-build>.<branch-build>.<branch-branch-build>
#
# As far as I can tell, branches are where internal Google
# modifications are added to turn Chromium OS into Chrome OS, and
# branch branches are used for fixes for specific devices.  So for
# Chromium OS they will always be 0.  This is a best guess, and is not
# documented.
with urlopen('https://cros-updates-serving.appspot.com/') as resp:
    document = etree.parse(resp, HTMLParser())
    # bgcolor="lightgreen" is set on the most up-to-date version for
    # each channel, so find a lightgreen cell in the "Stable" column.
    (platform_version, chrome_version) = document.xpath("""
        (//table[@id="cros-updates"]/tr/td[1 + count(
            //table[@id="cros-updates"]/thead/tr[1]/th[text() = "Stable"]
            /preceding-sibling::*)
        ][@bgcolor="lightgreen"])[1]/text()
    """)

# Leading integer components of the two version strings.
chrome_major_version = re.match(r'\d+', chrome_version)[0]
chromeos_tip_build = re.match(r'\d+', platform_version)[0]

# Find the most recent buildspec for the stable Chrome version and
# Chromium OS build number.  Its branch build and branch branch build
# numbers will (almost?) certainly be 0.  It will then end with an rc
Beispiel #13
0
    def upload(self,
               video_file,
               title,
               description,
               categories,
               tags,
               is_private,
               callback=None,
               session=None,
               proxy=None,
               **kwargs):
        """Submit *video_file* plus its metadata to the site's upload form.

        The raw file is first pushed via :meth:`_upload_video`; the returned
        file hash is then posted together with title/description/categories
        to ``my_video_upload``.

        :param callback: optional progress callback for the multipart monitor.
        :param kwargs: supports ``no_tags`` (skip the tags field) and
            ``add_content_source_id`` (scrape and include content_source_id).
        """

        session = session or self.http_settings.session
        proxy = proxy or self.http_settings.proxy

        no_tags = kwargs.get('no_tags', False)
        add_content_source_id = kwargs.get('add_content_source_id', False)

        my_video_upload_url = 'http://www.{domain}/my_video_upload/'.format(
            domain=self.domain)

        go_to_upload = session.get(my_video_upload_url, proxies=proxy)

        doc = etree.fromstring(go_to_upload.content, HTMLParser())

        # Push the raw video first; the form below references it by hash.
        filehash = self._upload_video(video_file, callback, session, proxy)

        fields = []
        if add_content_source_id:
            found_content_source_id = doc.xpath(
                '//input[@name="content_source_id"]/@value')
            if not found_content_source_id:
                raise NginxUploaderProblem(
                    'Cannot find required variable content_source_id')
            content_source_id = found_content_source_id[0]
            fields.append(('content_source_id', str(content_source_id)))

        if not no_tags:
            fields.append(('tags', str(",".join([tag for tag in tags]))))
        fields.append(('action', 'add_new_complete'))
        fields.append(('title', str(title)))
        fields.append(('description', str(description)))
        fields.append(('file', str(path.Path(video_file).name)))
        fields.append(('file_hash', str(filehash)))
        fields.append(('is_private', "1" if is_private else "0"))

        for category in categories:
            fields.append(('category_ids[]', str(category)))
        encoder = MultipartEncoder(fields)

        if callback:
            monitor = MultipartEncoderMonitor(encoder, callback)
        else:
            monitor = MultipartEncoderMonitor(encoder)

        #my_video_upload_url = 'http://httpbin.org/post'
        submit_video = session.post(
            my_video_upload_url,
            data=monitor,
            proxies=proxy,
            headers={'Content-Type': monitor.content_type})
0
    def _upload_video(self, video_file, callback, session, proxy):
        """Upload *video_file* to the kumm backend and return its uuid.

        Scrapes the api key, user id and callback url out of the upload form
        javascript, posts the file as multipart data, and returns the 'uuid'
        field of the JSON response.

        :param callback: optional progress callback for the multipart monitor.
        :raises KummProblem: when a required variable or the posting url is
            missing, or the upload response is not the expected JSON.
        """

        upload_path = self._get_path_to_upload()
        # GET for its side effects (cookies/session state); response unused.
        session.get(upload_path, proxies=proxy)

        upload_form_url = self._get_path_to_users_upload()

        session.headers.update({'X-Requested-With': 'XMLHttpRequest'})
        get_upload_form = session.get(upload_form_url,
                                      proxies=proxy,
                                      verify=False)

        regex_api_key = r'var\s+kumm_api_key\s+=\s+"(.*?)"'
        regex_user_id = r'var\s+user_id\s+=\s+"(.*?)"'
        regex_callback_url = r'var\s+callback_url\s+=\s+"(.*?)"'

        found_api_key = re.search(regex_api_key, get_upload_form.content)
        found_user_id = re.search(regex_user_id, get_upload_form.content)
        found_callback_url = re.search(regex_callback_url,
                                       get_upload_form.content)

        if not found_api_key:
            raise KummProblem('Could not find api_key')
        if not found_user_id:
            raise KummProblem('Could not find user_id')
        if not found_callback_url:
            raise KummProblem('Could not find callback_url')

        api_key = found_api_key.group(1)
        user_id = found_user_id.group(1)
        callback_url = found_callback_url.group(1)

        doc = etree.fromstring(get_upload_form.content, HTMLParser())
        get_upload_url = doc.xpath('//input[@id="fileupload"]/@data-url')

        if not get_upload_url:
            raise KummProblem(
                'Could not find kumm posting url for the video upload')

        posting_url = get_upload_url[0]
        # Pre-flight OPTIONS request the upload endpoint expects.
        session.options(posting_url, proxies=proxy, verify=False)

        fields = []
        fields.append(('token', api_key))
        fields.append(('callBackUrl', callback_url))
        fields.append(('website', self.website))
        fields.append(('userId', user_id))
        # TODO(review): file handle is never closed; the encoder streams it,
        # so closing here would need restructuring -- left as-is.
        fields.append(
            ('files[]', (path.Path(video_file).name, open(video_file, 'rb'))))

        encoder = MultipartEncoder(fields)
        if callback:
            monitor = MultipartEncoderMonitor(encoder, callback)
        else:
            monitor = MultipartEncoderMonitor(encoder)

        submit_upload = session.post(
            posting_url,
            data=monitor,
            headers={'Content-Type': monitor.content_type},
            proxies=proxy,
            verify=False)
        try:
            response = submit_upload.json()
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; Response.json() raises ValueError (or a subclass)
        # on a malformed body.
        except ValueError:
            raise KummProblem(
                'Expecting json, did not receive json after video uploading')
        else:
            if 'err' in response:
                raise KummProblem(
                    'Kumm uploader experienced an error after uploading:{err}'.
                    format(err=response['err']))
            elif 'uuid' in response:
                return response['uuid']