# Pixiv downloader/renamer script fragment (collapsed source; reformatted for
# review).  NOTE(review): this chunk is truncated inside `getFileList` — the
# rest of that function is not visible here.

# Filename pattern used when naming downloaded Pixiv works.
fname_pixiv_parttern = "{user} - {title} ({work_id}@{user_id})[{tags}]{{{tools}}}"
dropboxed = True  # presumably toggles Dropbox-specific path handling — TODO confirm
opt_tagFile = opt_renameFile = False

import os, sys, re, time, io
import urllib, json
from optparse import OptionParser
from PIL import Image
import lxml.html as lhtml
from lxml.etree import HTMLParser

# Parser that forces UTF-8 decoding of fetched pages.
utf8Parser = HTMLParser(encoding="utf-8")
# Default filename filter.  NOTE(review): the pattern '.*\..*p.*g' is very
# permissive (matches far more than png/jpg) — verify intent.
defaultFilter = re.compile('.*\..*p.*g')
dir_slash = '/'

def clearifyText(s):
    # Drop newlines, then squeeze runs of spaces and tabs down to single
    # separators.
    s = s.replace('\n','')
    s = ' '.join([t for t in s.split(' ') if t])
    s = '\t'.join([t for t in s.split('\t') if t])
    return s

def getFileList(path, filter=defaultFilter):
    # Recursively collect files under `path` matching `filter`.
    # NOTE(review): `os.path.isdir(file)` tests a bare name relative to the
    # CWD rather than `path`, and the recursive call passes `path` again
    # (risking infinite recursion) — looks buggy, but the function is
    # truncated in this view; confirm against the full source.
    r = []
    for file in os.listdir(path):
        if os.path.isdir(file):
            r += getFileList(path, filter)
def parse(self, parser=None, base_url=None):
    """Parse the underlying html source using the `lxml` library.

    The parsed tree is stored in :attr:`root` of this object, which can
    then be used to perform numerous operations.

    Parameters
    ----------
    parser : HTMLParser, optional
        Parser to use; when omitted a default parser bound to
        ``self.encoding`` is created.
    base_url : str, optional
        Base url handed to lxml for resolving relative references.

    Returns
    -------
    ElementTree
        The root element of the parsed tree (may be ``None``).

    Raises
    ------
    TypeError
        If *parser* is supplied but is not an ``HTMLParser`` instance.
    """
    utx = self._get_utx()

    assert utx is not None, "UrlTransformer not Implemented."  # internal error
    assert utx.base_path is not None, "Base Path is not set!"
    assert utx.base_url is not None, "Base url is not Set!"

    # BUGFIX: the original built `TypeError(...)` without raising it, so a
    # bad parser silently slipped through; it also flagged the legitimate
    # parser=None case.  Reject only a parser that was supplied *and* has
    # the wrong type, then fall back to a default parser.
    if parser is not None and not isinstance(parser, HTMLParser):
        raise TypeError("Expected instance of <%r>, got <%r>" % (HTMLParser, parser))
    if parser is None:
        parser = HTMLParser(encoding=self.encoding, collect_ids=False)

    source = self.get_source()
    assert source is not None, "Source is not Set!"
    assert hasattr(source, 'read'), "File like object is required!"
    # assert self._element_factory is not None
    # assert hasattr(self._element_factory, 'make_element')

    LOGGER.info(
        'Parsing tree with source: <%r> encoding <%s> and parser <%r>'
        % (self._source, self.encoding, parser))

    context_tree = lxml_parse(source, parser=parser, base_url=base_url)
    # The tree generated by the parse is stored in self.root and can be
    # utilised further for any number of use cases.
    self._tree = context_tree
    self.root = context_tree.getroot()

    if self.root is not None:
        # WaterMarking :)
        self.root.insert(
            0, Comment(MARK.format('', __version__, utx.url, utc_now(), '')))

    # There are internal links present on the html page which are files
    # that include `#`, `javascript:` and `data:base64;` type links, or a
    # simple `/` url referring anchor tag; these links need to be left as is.
    factory = getattr(self, 'make_element', None)
    assert callable(factory), "Element generator is not callable!"

    # Modify the tree elements.
    for el in context_tree.iter():
        # An element can contain multiple urls.
        for pack in self._handle_lxml_elem(el):
            if pack is not None:
                elem, attr, url, pos = pack
            else:  # pragma: no cover
                continue
            if elem is not None:
                o = factory(elem, attr, url, pos)
                if o is not None:
                    self._stack.append(o)

    self._parseComplete = True
    return self.root
substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. '''
# NOTE(review): the text above is the tail of the module's MIT licence
# docstring; its opening quotes are outside this view.

import json
from blox.base import Blox, Text, UnsafeText
from blox.all import factory
from xml.dom import minidom
from lxml.etree import HTMLParser, parse, fromstring

# Shared lxml HTML parser instance used by this module.
parser = HTMLParser()

# Python-source template emitted by the generator; `{indent}`, `{accessors}`
# and `{build_steps}` are filled in via str.format.
# NOTE(review): the template body appears collapsed onto one line in this
# source view — confirm the original line breaks before relying on its output.
SCRIPT_TEMPLATE = """# WARNING: DON'T EDIT AUTO-GENERATED from blox.base import Blox, Text, UnsafeText class Template(Blox): {indent}__slots__ = tuple({accessors}) def build(factory): {indent}template = Template() {indent}{build_steps} {indent}return template """
def extract_bleach(text):
    """Extract the tags/attributes present in *text*, shaped for bleach."""
    sink = MarkupExtractor()
    feed_parser = HTMLParser(collect_ids=False, target=sink)
    feed_parser.feed(text)
    result = {
        "tags": sink.found_tags,
        "attributes": sink.found_attributes,
    }
    return result
def html2plaintext(html, body_id=None, encoding='utf-8'):
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    """Convert an HTML text to a plain-text rendition.

    If @body_id is provided then that element (not necessarily <body>) is
    used as the root of the conversion; otherwise <body> is used.  Anchor
    hrefs are collected and appended as a numbered link index.
    """
    html = ustr(html)

    from lxml.etree import Element, tostring
    try:
        # Prefer the BeautifulSoup-backed parser when available (more lenient).
        from lxml.html.soupparser import fromstring
        kwargs = {}
    except ImportError:
        _logger.debug('tools.misc.html2plaintext: cannot use BeautifulSoup, fallback to lxml.etree.HTMLParser')
        from lxml.etree import fromstring, HTMLParser
        kwargs = dict(parser=HTMLParser())

    tree = fromstring(html, **kwargs)

    if body_id is not None:
        # NOTE(review): body_id is interpolated unquoted into the XPath, so
        # callers apparently must pass an already-quoted value (e.g. "'main'")
        # for the expression to match by string — TODO confirm against callers.
        source = tree.xpath('//*[@id=%s]'%(body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    # Replace each <a> with "text [n]" and remember its href for the
    # numbered index appended at the end.
    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        title = link.text  # NOTE(review): assigned but never used.
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            # NOTE(review): link.text may be None, producing "None [n]".
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(tostring(tree, encoding=encoding))

    # Poor-man's markdown: map emphasis/heading tags onto punctuation.
    html = html.replace('<strong>','*').replace('</strong>','*')
    html = html.replace('<b>','*').replace('</b>','*')
    html = html.replace('<h3>','*').replace('</h3>','*')
    html = html.replace('<h2>','**').replace('</h2>','**')
    html = html.replace('<h1>','**').replace('</h1>','**')
    html = html.replace('<em>','/').replace('</em>','/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub('<br\s*/?>', '\n', html)
    # Strip every remaining tag (non-greedy), then squeeze double spaces.
    # NOTE(review): single-pass replace — longer space runs survive.
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    # Append the collected link targets as a numbered index.
    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i+1, url)

    return html
#!/usr/bin/env python # coding: utf-8 import codecs import sys from locale import getpreferredencoding from functools import partial import logging from lxml.etree import HTMLParser from lxml.html import parse logging.basicConfig(format='%(message)s', level=logging.INFO) parse = partial(parse, parser=HTMLParser(encoding="cp1251")) encoding = getpreferredencoding() urls = set() out = sys.stdout columns = [ u'Регион', u'УИК', u'Число избирателей, внесенных в список избирателей', u'Число избирательных бюллетеней, полученных участковой избирательной комиссией', u'Число избирательных бюллетеней, выданных избирателям, проголосовавшим досрочно', u'Число избирательных бюллетеней, выданных избирателям в помещении для голосования', u'Число избирательных бюллетеней, выданных избирателям вне помещения для голосования ', u'Число погашенных избирательных бюллетеней', u'Число избирательных бюллетеней в переносных ящиках для голосования', u'Число избирательных бюллетеней в стационарных ящиках для голосования',
def create_html_parser_unicode(self):
    """Build and return an lxml HTMLParser that decodes input as UTF-8."""
    return HTMLParser(encoding="utf-8")
def create(cls, username, password, email, gender, **kwargs):
    """Register a new drtuber.com account and return a cls instance.

    Network-heavy: fetches the signup form, solves the captcha via the
    configured solver, then posts the registration form.

    Raises CannotFindVar when the hidden formId input is missing, and
    AccountProblem when the site reports registration errors.
    """
    def get_captcha_image():
        # Download the captcha challenge as an in-memory binary stream.
        download_captcha = session.get('http://www.drtuber.com/captcha', proxies=proxy)
        captcha_data = io.BytesIO(download_captcha.content)
        return captcha_data

    # Normalise single-letter gender flags to the site's expected values.
    if gender.lower() == 'm':
        gender = 'Male'
    if gender.lower() == 'f':
        gender = 'Female'

    http_settings = kwargs.get('http_settings', HttpSettings())
    session = kwargs.get('session', http_settings.session)
    proxy = kwargs.get('proxy', http_settings.proxy)
    captcha_solver = kwargs.get('captcha_solver', DEFAULT_CAPTCHA_SOLVER)
    maximum_waiting_time = kwargs.get('maximum_waiting_time', DEFAULT_CAPTCHA_MAXIMUM_WAITING)

    # The signup form arrives as an HTML fragment inside a JSON envelope.
    url = 'http://www.drtuber.com/ajax/popup_forms?form=signup'
    sign_up_form = session.get(url, proxies=proxy)
    doc = etree.fromstring(sign_up_form.json()['answer'], HTMLParser())
    found_form_id = doc.xpath('//input[@name="formId"]/@value')
    if not found_form_id:
        raise CannotFindVar(
            'Cannot find formId , required for creating an account')
    form_id = found_form_id[0]

    captcha_image = get_captcha_image()
    captcha_response = cls.submit_captcha_and_wait(
        captcha_image,
        maximum_waiting_time=maximum_waiting_time,
        captcha_solver=captcha_solver)

    url = 'http://www.drtuber.com/signup/do?ajax=true&json=true'
    post = {
        'username': username,
        'password': password,
        'password_confirm': password,
        'gender': gender,
        'email': email,
        'verification': captcha_response,
        'terms': 'on',
        'age': 'on',
        'formId': form_id,
        'type': 'free',
        'redirectUrl': '/',
        'from': ''
    }
    create_account = session.post(url, data=post, proxies=proxy)
    response = create_account.json()
    if response['errors']:
        raise AccountProblem('Cannot create drtuber account due to errors:' \
            '{e}'.format(e=" AND ".join(response['errors'])))
    remember_me = kwargs.get('remember_me', False)
    return cls(username=username, password=password, email=email, remember_me=remember_me, gender=gender)
def start(self):
    """Run the pornhub video-upload flow for self.video_upload_request.

    Returns {'status': True} on success.  On failure fires the 'failed'
    hook and re-raises only when self.bubble_up_exception is set.

    NOTE(review): this is Python 2 code (`print` statement below).
    `session` is first assigned inside the try block, so the except
    handler's `del session.headers[...]` raises NameError if validation
    fails before that assignment — confirm and guard upstream.
    """
    try:
        if not isinstance(self.video_upload_request, PornhubVideoUploadRequest):
            raise InvalidVideoUploadRequest(
                'Invalid video_upload_request, '
                'it needs to be a PornhubVideoUploadRequest instance')
        if not isinstance(self.account, PornhubAccount):
            raise InvalidAccount(
                'Invalid account, it needs to be a PornhubAccount instance'
            )
        if not self.account.is_logined():
            raise NotLogined('Pornhub account is not logined')
        self.call_hook('started',
                       video_upload_request=self.video_upload_request,
                       account=self.account)
        session = self.account.http_settings.session
        proxy = self.account.http_settings.proxy
        go_to_upload = session.get('http://www.pornhub.com/upload/video',
                                   proxies=proxy)
        session.headers.update({"X-Requested-With": "XMLHttpRequest"})
        # Cut the saveBlockContent <div> out of the page and re-wrap it so
        # it parses as a standalone fragment.
        container = go_to_upload.content.\
            split('<div class="saveBlockContent">')[1].\
            split('</div><!-- /.saveBlockContent -->')[0]
        container = "".join(
            ['<div class="saveBlockContent">', container, '</div>'])
        doc = etree.fromstring(container, HTMLParser())
        get_cookie = doc.xpath('//input[@name="cookie[]"]/@value')
        get_platform_id = doc.xpath('//input[@name="platformId[]"]/@value')
        get_source = doc.xpath('//input[@name="source[]"]/@value')
        if len(get_cookie) == 0:
            raise CannotFindVar(
                'Cannot find input field with name:cookies[]')
        if len(get_platform_id) == 0:
            raise CannotFindVar(
                'Cannot find input field with name:platformId[]')
        if len(get_source) == 0:
            raise CannotFindVar(
                'Cannot find input field with name:source[]')
        video_file = self.video_upload_request.video_file
        title = self.video_upload_request.title.name
        tags = self.video_upload_request.tags
        category = self.video_upload_request.category
        is_private = self.video_upload_request.is_private
        is_homemade = self.video_upload_request.is_homemade
        porn_stars = self.video_upload_request.porn_stars
        # Serialise category id(s) as a JSON-style array string.
        if not isinstance(category, (tuple, list)):
            categories = '["{cat}"]'.format(cat=str(category.category_id))
        else:
            categories = [
                '"{c}"'.format(c=str(cat.category_id)) for cat in category
            ]
            categories = '[{cats}]'.format(cats=','.join(categories))
        if isinstance(tags, (tuple, list)):
            tags = " ".join([t.name for t in tags])
        privacy = "private" if is_private else "community"
        production = "homemade" if is_homemade else "professional"
        fields = []
        fields.append(('title', title))
        fields.append(('callbackUrl', ''))
        fields.append(('platformId', str(get_platform_id[0])))
        fields.append(('categories', categories))
        fields.append(('tags', tags))
        fields.append(('privacy', privacy))
        fields.append(('source', str(get_source[0])))
        fields.append(('pornstars', porn_stars))
        fields.append(('cookie', get_cookie[0]))
        fields.append(('production', production))
        fields.append(('timestamp', str(generate_timestamp())))
        fields.append(('isPremiumVideo', "0"))
        fields.append(('Filedata', (path.Path(video_file).name,
                                    open(video_file, 'rb'))))
        encoder = type(self).create_multipart_encoder(fields)
        self.upload_monitor = type(self).create_multipart_monitor(
            encoder=encoder, callback=self._hooks['uploading'])
        self.call_hook('uploading',
                       video_upload_request=self.video_upload_request,
                       account=self.account)
        # The upload endpoint url lives in inline javascript on the page.
        get_upload_url = re.search(r"var\s+url\s+=\s+'(.*?)',",
                                   go_to_upload.content)
        if not get_upload_url:
            raise CannotFindVar(
                'Cannot find uploading url for pornhub.com')
        url = get_upload_url.group(1)
        upload_video = session.post(
            url,
            data=self.upload_monitor,
            proxies=proxy,
            headers={'Content-Type': self.upload_monitor.content_type})
        # Success is detected by a marker substring in the response body.
        find_str = '{"@type":["GorillaHub\\\SDKs\\\SDKBundle\\\V0001\\\Domain\\\Responses\\\FileUploadedResponse",[]]'
        if find_str not in upload_video.content:
            raise FailedUpload(
                'Upload possibly failed, did not find success string:{string}'
                .format(string=find_str))
    except Exception as exc:
        del session.headers["X-Requested-With"]
        self.call_hook('failed',
                       video_upload_request=self.video_upload_request,
                       account=self.account,
                       traceback=traceback.format_exc(),
                       exc_info=sys.exc_info())
        print traceback.format_exc()
        # NOTE(review): in Python 2 `raise exc` loses the original traceback.
        if self.bubble_up_exception:
            raise exc
    else:
        # Success path: restore headers, fire the 'finished' hook.
        del session.headers["X-Requested-With"]
        self.call_hook('finished',
                       video_request=self.video_upload_request,
                       account=self.account,
                       settings={''})
        return {'status': True}
import re
from pprint import pprint
from flask import current_app, Flask, render_template, request, session, jsonify, abort, Blueprint
from app.models import Answers, Questions, db
from app.functions import *
from . import search
from lxml import html
from lxml.etree import HTMLParser
from app.functions import AlchemyEncoder
from stackapi import StackAPI
from urllib.parse import unquote
from app.celery_tasks import insertQuestion, insertAnswer

# Parser that drops whitespace-only text nodes when re-parsing answer HTML.
whitespace_parser = HTMLParser(remove_blank_text=True)

# Shared StackExchange API client: 20 results, one page per fetch.
stackOverflowConnection = StackAPI('stackoverflow')
stackOverflowConnection.page_size = 20
stackOverflowConnection.max_pages = 1

bp = Blueprint('main', __name__, url_prefix='/', static_folder='static')

def stack_get_answers(question_id):
    """ Api -> https://api.stackexchange.com/docs/types/answer """
    # Fetch this question's answers, highest-voted first, with bodies.
    # NOTE(review): the function is truncated in this view.
    returnedAnswers = stackOverflowConnection.fetch('questions/{ids}/answers',
                                                    ids=[int(question_id)],
                                                    sort='votes',
                                                    filter="withbody")
def create(cls, username, password, email, **kwargs):
    """Register a pornhub.com account; returns True on success.

    Raises CannotFindVar when the signup form's hidden fields cannot be
    scraped, and AccountProblem when the site reports errors or no
    confirmation element appears.
    """
    from lxml import etree
    from lxml.etree import HTMLParser
    from bringyourownproxies.httpclient import HttpSettings
    from bringyourownproxies.errors import CannotFindVar

    # NOTE(review): remember_me is read but never used in this method.
    remember_me = kwargs.get('remember_me', False)
    http_settings = kwargs.get('http_settings', HttpSettings())
    session = http_settings.session
    proxy = http_settings.proxy

    # Warm up cookies before requesting the signup page.
    session.get('http://www.pornhub.com', proxies=proxy)
    create_page = session.get('http://www.pornhub.com/create_account',
                              proxies=proxy)
    doc = etree.fromstring(create_page.content, HTMLParser())
    found_signup_key = doc.xpath('//input[@name="signup_key"]/@value')
    found_signup_hash = doc.xpath('//input[@name="signup_hash"]/@value')
    found_signup_id = doc.xpath('//input[@name="signup_id"]/@value')
    if not found_signup_key:
        raise CannotFindVar('Cannot find signup_key in pornhub.com')
    if not found_signup_hash:
        raise CannotFindVar('Cannot find signup_hash in pornhub.com')
    if not found_signup_id:
        raise CannotFindVar('Cannot find signup_id in pornhub.com')
    signup_key = found_signup_key[0]
    signup_hash = found_signup_hash[0]
    signup_id = found_signup_id[0]
    post = {
        'signup_key': signup_key,
        'signup_hash': signup_hash,
        'signup_id': signup_id,
        'check_what': 'username',
        'email': email,
        'username': username,
        'password': password,
        'agreed': '1'
    }
    session.headers.update({'X-Requested-With': 'XMLHttpRequest'})
    # NOTE(review): this pre-check POST sends no payload — confirm intended.
    session.post('http://www.pornhub.com/user/create_account_check',
                 proxies=proxy)
    create_account = session.post('http://www.pornhub.com/create_account',
                                  data=post,
                                  proxies=proxy)
    errors = []
    doc = etree.fromstring(create_account.content, HTMLParser())
    found_errors = doc.xpath('//div[@class="error"]/div')
    if found_errors:
        # Trim surrounding decoration characters from each error message.
        errors = [
            error.text[2:len(error.text) - 1] for error in found_errors
        ]
        raise AccountProblem(
            'Failed creating account at pornhub due to errors:{e}'.format(
                e=' AND '.join(errors)))
    found_confirmation = doc.xpath(
        '//div[@class="sprite-signup-confirmation absolute"]')
    if found_confirmation:
        return True
    else:
        raise AccountProblem(
            'Failed creating account at pornhub for unknown problem')
# Locations of the crosvm source and the Chrome OS buildspec manifests.
git_path = 'chromiumos/platform/crosvm'
git_root = 'https://chromium.googlesource.com/'
manifest_versions = f'{git_root}chromiumos/manifest-versions'
buildspecs_url = f'{manifest_versions}/+/refs/heads/master/full/buildspecs/'

# CrOS version numbers look like this:
# [<chrome-major-version>.]<tip-build>.<branch-build>.<branch-branch-build>
#
# As far as I can tell, branches are where internal Google
# modifications are added to turn Chromium OS into Chrome OS, and
# branch branches are used for fixes for specific devices. So for
# Chromium OS they will always be 0. This is a best guess, and is not
# documented.
with urlopen('https://cros-updates-serving.appspot.com/') as resp:
    document = etree.parse(resp, HTMLParser())

# bgcolor="lightgreen" is set on the most up-to-date version for
# each channel, so find a lightgreen cell in the "Stable" column.
# NOTE(review): xpath() returns a list; this unpacking assumes the cell
# holds exactly two text nodes (platform, chrome) — confirm page layout.
(platform_version, chrome_version) = document.xpath("""
    (//table[@id="cros-updates"]/tr/td[1 + count(
        //table[@id="cros-updates"]/thead/tr[1]/th[text() = "Stable"]
        /preceding-sibling::*)
    ][@bgcolor="lightgreen"])[1]/text()
""")

chrome_major_version = re.match(r'\d+', chrome_version)[0]
chromeos_tip_build = re.match(r'\d+', platform_version)[0]

# Find the most recent buildspec for the stable Chrome version and
# Chromium OS build number. Its branch build and branch branch build
# numbers will (almost?) certainly be 0. It will then end with an rc
def upload(self, video_file, title, description, categories, tags,
           is_private, callback=None, session=None, proxy=None, **kwargs):
    """Upload a video through the site's my_video_upload form.

    kwargs: no_tags (skip the tags field), add_content_source_id (scrape
    and forward the hidden content_source_id input).

    NOTE(review): the final POST response (`submit_video`) is never
    inspected and the method returns None — confirm failures are
    surfaced elsewhere.
    """
    session = session or self.http_settings.session
    proxy = proxy or self.http_settings.proxy
    no_tags = kwargs.get('no_tags', False)
    add_content_source_id = kwargs.get('add_content_source_id', False)
    my_video_upload_url = 'http://www.{domain}/my_video_upload/'.format(
        domain=self.domain)
    go_to_upload = session.get(my_video_upload_url, proxies=proxy)
    doc = etree.fromstring(go_to_upload.content, HTMLParser())
    # Upload the raw file first; the form below only references its hash.
    filehash = self._upload_video(video_file, callback, session, proxy)
    fields = []
    if add_content_source_id:
        found_content_source_id = doc.xpath(
            '//input[@name="content_source_id"]/@value')
        if not found_content_source_id:
            raise NginxUploaderProblem(
                'Cannot find required variable content_source_id')
        content_source_id = found_content_source_id[0]
        fields.append(('content_source_id', str(content_source_id)))
    if not no_tags:
        fields.append(('tags', str(",".join([tag for tag in tags]))))
    fields.append(('action', 'add_new_complete'))
    fields.append(('title', str(title)))
    fields.append(('description', str(description)))
    fields.append(('file', str(path.Path(video_file).name)))
    fields.append(('file_hash', str(filehash)))
    fields.append(('is_private', "1" if is_private else "0"))
    for category in categories:
        fields.append(('category_ids[]', str(category)))
    encoder = MultipartEncoder(fields)
    if callback:
        monitor = MultipartEncoderMonitor(encoder, callback)
    else:
        monitor = MultipartEncoderMonitor(encoder)
    #my_video_upload_url = 'http://httpbin.org/post'
    submit_video = session.post(
        my_video_upload_url,
        data=monitor,
        proxies=proxy,
        headers={'Content-Type': monitor.content_type})
def _upload_video(self, video_file, callback, session, proxy):
    """Push the raw video file to the Kumm upload backend; returns its uuid.

    Scrapes the api key, user id and callback url out of the upload
    form's inline javascript, then POSTs the file as multipart form data.
    Raises KummProblem on any scraping or response failure.
    """
    upload_path = self._get_path_to_upload()
    go_to_upload = session.get(upload_path, proxies=proxy)
    upload_form_url = self._get_path_to_users_upload()
    session.headers.update({'X-Requested-With': 'XMLHttpRequest'})
    get_upload_form = session.get(upload_form_url, proxies=proxy, verify=False)
    # The needed credentials live in inline `var ... = "..."` javascript.
    regex_api_key = r'var\s+kumm_api_key\s+=\s+"(.*?)"'
    regex_user_id = r'var\s+user_id\s+=\s+"(.*?)"'
    regex_callback_url = r'var\s+callback_url\s+=\s+"(.*?)"'
    found_api_key = re.search(regex_api_key, get_upload_form.content)
    found_user_id = re.search(regex_user_id, get_upload_form.content)
    found_callback_url = re.search(regex_callback_url, get_upload_form.content)
    if not found_api_key:
        raise KummProblem('Could not find api_key')
    if not found_user_id:
        raise KummProblem('Could not find user_id')
    if not found_callback_url:
        raise KummProblem('Could not find callback_url')
    api_key = found_api_key.group(1)
    user_id = found_user_id.group(1)
    callback_url = found_callback_url.group(1)
    doc = etree.fromstring(get_upload_form.content, HTMLParser())
    get_upload_url = doc.xpath('//input[@id="fileupload"]/@data-url')
    if len(get_upload_url) == 0:
        raise KummProblem(
            'Could not find kumm posting url for the video upload')
    posting_url = get_upload_url[0]
    # Preflight OPTIONS request the endpoint expects before the real POST.
    session.options(posting_url, proxies=proxy, verify=False)
    fields = []
    fields.append(('token', api_key))
    fields.append(('callBackUrl', callback_url))
    fields.append(('website', self.website))
    fields.append(('userId', user_id))
    fields.append(
        ('files[]', (path.Path(video_file).name, open(video_file, 'rb'))))
    encoder = MultipartEncoder(fields)
    if callback:
        monitor = MultipartEncoderMonitor(encoder, callback)
    else:
        monitor = MultipartEncoderMonitor(encoder)
    submit_upload = session.post(
        posting_url,
        data=monitor,
        headers={'Content-Type': monitor.content_type},
        proxies=proxy,
        verify=False)
    try:
        response = submit_upload.json()
    # NOTE(review): bare except swallows everything, even KeyboardInterrupt;
    # narrowing to ValueError would be safer — confirm before changing.
    except:
        raise KummProblem(
            'Expecting json, did not receive json after video uploading')
    else:
        if 'err' in response:
            raise KummProblem(
                'Kumm uploader experienced an error after uploading:{err}'.
                format(err=response['err']))
        elif 'uuid' in response:
            return response['uuid']