def test_repr(self): A = namedtuple('A', 'x') self.assertEqual(repr(A(1)), 'A(x=1)') # repr should show the name of the subclass class B(A): pass self.assertEqual(repr(B(1)), 'B(x=1)')
def test_name_conflicts(self): # Some names like "self", "cls", "tuple", "itemgetter", and "property" # failed when used as field names. Test to make sure these now work. T = namedtuple('T', 'itemgetter property self cls tuple') t = T(1, 2, 3, 4, 5) self.assertEqual(t, (1,2,3,4,5)) newt = t._replace(itemgetter=10, property=20, self=30, cls=40, tuple=50) self.assertEqual(newt, (10,20,30,40,50))
def test_factory(self): Point = namedtuple('Point', 'x y') self.assertEqual(Point.__name__, 'Point') self.assertEqual(Point.__slots__, ()) self.assertEqual(Point.__module__, __name__) self.assertEqual(Point.__getitem__, tuple.__getitem__) self.assertEqual(Point._fields, ('x', 'y')) self.assertRaises(ValueError, namedtuple, 'abc%', 'efg ghi') # type has non-alpha char self.assertRaises(ValueError, namedtuple, 'class', 'efg ghi') # type has keyword self.assertRaises(ValueError, namedtuple, '9abc', 'efg ghi') # type starts with digit self.assertRaises(ValueError, namedtuple, 'abc', 'efg g%hi') # field with non-alpha char self.assertRaises(ValueError, namedtuple, 'abc', 'abc class') # field has keyword self.assertRaises(ValueError, namedtuple, 'abc', '8efg 9ghi') # field starts with digit self.assertRaises(ValueError, namedtuple, 'abc', '_efg ghi') # field with leading underscore self.assertRaises(ValueError, namedtuple, 'abc', 'efg efg ghi') # duplicate field namedtuple('Point0', 'x1 y2') # Verify that numbers are allowed in names namedtuple('_', 'a b c') # Test leading underscores in a typename nt = namedtuple('nt', 'the quick brown fox') # check unicode input self.assertNotIn("u'", repr(nt._fields)) nt = namedtuple('nt', ('the', 'quick')) # check unicode input self.assertNotIn("u'", repr(nt._fields)) self.assertRaises(TypeError, Point._make, [11]) # catch too few args self.assertRaises(TypeError, Point._make, [11, 22, 33]) # catch too many args
def test_name_fixer(self): for spec, renamed in [ [('efg', 'g%hi'), ('efg', '_1')], # field with non-alpha char [('abc', 'class'), ('abc', '_1')], # field has keyword [('8efg', '9ghi'), ('_0', '_1')], # field starts with digit [('abc', '_efg'), ('abc', '_1')], # field with leading underscore [('abc', 'efg', 'efg', 'ghi'), ('abc', 'efg', '_2', 'ghi')], # duplicate field [('abc', '', 'x'), ('abc', '_1', 'x')], # fieldname is a space ]: self.assertEqual(namedtuple('NT', spec, rename=True)._fields, renamed)
def test_odd_sizes(self): Zero = namedtuple('Zero', '') self.assertEqual(Zero(), ()) self.assertEqual(Zero._make([]), ()) self.assertEqual(repr(Zero()), 'Zero()') self.assertEqual(Zero()._asdict(), {}) self.assertEqual(Zero()._fields, ()) Dot = namedtuple('Dot', 'd') self.assertEqual(Dot(1), (1,)) self.assertEqual(Dot._make([1]), (1,)) self.assertEqual(Dot(1).d, 1) self.assertEqual(repr(Dot(1)), 'Dot(d=1)') self.assertEqual(Dot(1)._asdict(), {'d':1}) self.assertEqual(Dot(1)._replace(d=999), (999,)) self.assertEqual(Dot(1)._fields, ('d',)) # n = 5000 n = 254 # SyntaxError: more than 255 arguments: import string, random names = list(set(''.join([random.choice(string.ascii_letters) for j in range(10)]) for i in range(n))) n = len(names) Big = namedtuple('Big', names) b = Big(*range(n)) self.assertEqual(b, tuple(range(n))) self.assertEqual(Big._make(range(n)), tuple(range(n))) for pos, name in enumerate(names): self.assertEqual(getattr(b, name), pos) repr(b) # make sure repr() doesn't blow-up d = b._asdict() d_expected = dict(zip(names, range(n))) self.assertEqual(d, d_expected) b2 = b._replace(**dict([(names[1], 999),(names[-5], 42)])) b2_expected = list(range(n)) b2_expected[1] = 999 b2_expected[-5] = 42 self.assertEqual(b2, tuple(b2_expected)) self.assertEqual(b._fields, tuple(names))
def test_instance(self):
    """Check construction, representation, and helper methods of an instance."""
    Point = namedtuple('Point', 'x y')
    p = Point(11, 22)

    # Every calling convention must build an equal value.
    self.assertEqual(p, Point(x=11, y=22))
    self.assertEqual(p, Point(11, y=22))
    self.assertEqual(p, Point(y=22, x=11))
    self.assertEqual(p, Point(*(11, 22)))
    self.assertEqual(p, Point(**dict(x=11, y=22)))

    self.assertRaises(TypeError, Point, 1)                              # too few args
    self.assertRaises(TypeError, Point, 1, 2, 3)                        # too many args
    self.assertRaises(TypeError, eval, 'Point(XXX=1, y=2)', locals())   # wrong keyword argument
    self.assertRaises(TypeError, eval, 'Point(x=1)', locals())          # missing keyword argument

    self.assertEqual(repr(p), 'Point(x=11, y=22)')
    self.assertNotIn('__weakref__', dir(p))
    self.assertEqual(p, Point._make([11, 22]))                          # test _make classmethod
    self.assertEqual(p._fields, ('x', 'y'))                             # test _fields attribute
    self.assertEqual(p._replace(x=1), (1, 22))                          # test _replace method
    self.assertEqual(p._asdict(), dict(x=11, y=22))                     # test _asdict method
    self.assertEqual(vars(p), p._asdict())                              # verify that vars() works

    try:
        p._replace(x=1, error=2)
    except ValueError:
        pass
    else:
        # TestCase.fail() is the unittest call that forces a failure with a
        # message; there is no TestCase._fail().
        self.fail('Did not detect an incorrect fieldname')

    # verify that field string can have commas
    Point = namedtuple('Point', 'x, y')
    p = Point(x=11, y=22)
    self.assertEqual(repr(p), 'Point(x=11, y=22)')

    # verify that fieldspec can be a non-string sequence
    Point = namedtuple('Point', ('x', 'y'))
    p = Point(x=11, y=22)
    self.assertEqual(repr(p), 'Point(x=11, y=22)')
def test_tupleness(self): Point = namedtuple('Point', 'x y') p = Point(11, 22) self.assertIsInstance(p, tuple) self.assertEqual(p, (11, 22)) # matches a real tuple self.assertEqual(tuple(p), (11, 22)) # coercable to a real tuple self.assertEqual(list(p), [11, 22]) # coercable to a list self.assertEqual(max(p), 22) # iterable self.assertEqual(max(*p), 22) # star-able x, y = p self.assertEqual(p, (x, y)) # unpacks like a tuple self.assertEqual((p[0], p[1]), (11, 22)) # indexable like a tuple self.assertRaises(IndexError, p.__getitem__, 3) self.assertEqual(p.x, x) self.assertEqual(p.y, y) self.assertRaises(AttributeError, eval, 'p.z', locals())
from wpull.scraper.util import urljoin_safe from wpull.url import parse_url_or_log from wpull.writer import NullWriter _logger = logging.getLogger(__name__) _ = gettext.gettext GLOB_CHARS = frozenset('[]*?') FTPProcessorFetchParams = namedlist.namedtuple( 'FTPProcessorFetchParamsType', [ ('remove_listing', True), ('glob', True), ('preserve_permissions', False), ('retr_symlinks', True), ] ) '''FTPProcessorFetchParams Args: remove_listing (bool): Remove `.listing` files after fetching. glob (bool): Enable URL globbing. preserve_permissions (bool): Preserve file permissions. follow_symlinks (bool): Follow symlinks. ''' FTPProcessorInstances = namedlist.namedtuple( 'FTPProcessorInstancesType',
import namedlist import asyncio from wpull.driver.process import Process import wpull.util _logger = logging.getLogger(__name__) PhantomJSDriverParams = namedlist.namedtuple('PhantomJSDriverParamsType', [ 'url', ('snapshot_paths', []), ('wait_time', 1), ('num_scrolls', 10), ('smart_scroll', True), ('snapshot', True), ('viewport_size', (1200, 1920)), ('paper_size', (2400, 3840)), ('event_log_filename', None), ('action_log_filename', None), ('custom_headers', {}), ('page_settings', {}), ]) '''PhantomJS Driver parameters Attributes: url (str): URL of page to fetch. snapshot_type (list): List of filenames. Accepted extensions are html, pdf, png, gif. wait_time (float): Time between page scrolls. num_scrolls (int): Maximum number of scrolls. smart_scroll (bool): Whether to stop scrolling if number of
from wpull.protocol.ftp.util import FTPServerError from wpull.scraper.util import urljoin_safe from wpull.url import parse_url_or_log, URLInfo from wpull.writer import NullWriter, BaseFileWriter _logger = StyleAdapter(logging.getLogger(__name__)) _ = gettext.gettext GLOB_CHARS = frozenset('[]*?') FTPProcessorFetchParams = namedlist.namedtuple( 'FTPProcessorFetchParamsType', [ ('remove_listing', True), ('glob', True), ('preserve_permissions', False), ('retr_symlinks', True), ] ) '''FTPProcessorFetchParams Args: remove_listing (bool): Remove `.listing` files after fetching. glob (bool): Enable URL globbing. preserve_permissions (bool): Preserve file permissions. follow_symlinks (bool): Follow symlinks. ''' class HookPreResponseBreak(ProtocolError):
for session in self._sessions: session.response_data(data) def __exit__(self, *args): for context in self._contexts: context.__exit__(*args) WARCRecorderParams = namedlist.namedtuple( 'WARCRecorderParamsType', [ ('compress', True), ('extra_fields', None), ('temp_dir', None), ('log', True), ('appending', False), ('digests', True), ('cdx', None), ('max_size', None), ('move_to', None), ('url_table', None), ('software_string', None) ] ) ''':class:`WARCRecorder` parameters. Args: compress (bool): If True, files will be compressed with gzip extra_fields (list): A list of key-value pairs containing extra metadata fields temp_dir (str): Directory to use for temporary files log (bool): Include the program logging messages in the WARC file
from trollius import From, Return from wpull.backport.logging import BraceMessage as __ from wpull.document.html import HTMLReader from wpull.body import Body from wpull.driver.phantomjs import PhantomJSDriverParams from wpull.namevalue import NameValueRecord from wpull.warc import WARCRecord import wpull.url PhantomJSParams = namedlist.namedtuple('PhantomJSParamsType', [ ('snapshot_types', ('html', 'pdf')), ('wait_time', 1), ('num_scrolls', 10), ('smart_scroll', True), ('snapshot', True), ('viewport_size', (1200, 1920)), ('paper_size', (2400, 3840)), ('load_time', 900), ('custom_headers', {}), ('page_settings', {}), ]) '''PhantomJS parameters Attributes: snapshot_type (list): File types. Accepted are html, pdf, png, gif. wait_time (float): Time between page scrolls. num_scrolls (int): Maximum number of scrolls. smart_scroll (bool): Whether to stop scrolling if number of requests & responses do not change. snapshot (bool): Whether to take snapshot files. viewport_size (tuple): Width and height of the page viewport.
It must call one of :meth:`.engine.URLItem.set_status` or :meth:`.engine.URLItem.skip`. ''' pass def close(self): '''Run any clean up actions.''' pass WebProcessorFetchParams = namedlist.namedtuple( 'WebProcessorFetchParamsType', [ ('retry_connrefused', False), ('retry_dns_error', False), ('post_data', None), ('strong_redirects', True), ('content_on_error', False), ] ) '''WebProcessorFetchParams Args: retry_connrefused: If True, don't consider a connection refused error to be a permanent error. retry_dns_error: If True, don't consider a DNS resolution error to be permanent error. post_data (str): If provided, all requests will be POSTed with the given `post_data`. `post_data` must be in percent-encoded query format ("application/x-www-form-urlencoded"). strong_redirects (bool): If True, redirects are allowed to span hosts.
def __init__(self):
    """Give the instance a ``Result`` named-tuple type with type/text/result fields."""
    result_fields = ['type', 'text', 'result']
    self.Result = namedtuple('Result', result_fields)
def __init__(self, field_names, default=NO_DEFAULT, filters=None):
    """Build the args tuple type and record its fields and filters.

    Args:
        field_names: Field specification handed to ``namedtuple``.
        default: Third positional argument handed to ``namedtuple``.
        filters: Optional filters; when truthy they are passed through
            ``_validate_filters`` together with the field names,
            otherwise an empty dict is stored.
    """
    self.args_tuple = namedtuple('_ArgsTuple', field_names, default)
    self.fields = self.args_tuple._fields

    if filters:
        self.filters = _validate_filters(self.fields, filters)
    else:
        self.filters = {}
def __init__(self, *args, **kwargs): self.size_limit = kwargs.pop("size_limit", None) OrderedDict.__init__(self, *args, **kwargs) self._check_size_limit() def __setitem__(self, key, value): OrderedDict.__setitem__(self, key, value) self._check_size_limit() def _check_size_limit(self): if self.size_limit is not None: while len(self) > self.size_limit: self.popitem(last=False) EpicComment = namedlist.namedtuple('EpicComment', [('id', ''), ('submission', '')]) class EpicValidator(CommentValidator): __slots__ = ['_sticky_store', '_comment_store'] def __init__(self, reddit): super().__init__(reddit) self._sticky_store = LimitedSizeDict(size_limit=20) self._comment_store = deque(maxlen=200) def validate(self, comment: Comment) -> Tuple[Action, Rule]: css_class = comment.author_flair_css_class if not self.has_comment(comment) and css_class and css_class.lower( ) in self.config['general']['class']: self._comment_store.appendleft(
_ = gettext.gettext _logger = logging.getLogger(__name__) DEFAULT_BUFFER_SIZE = 1048576 '''Default buffer size in bytes.''' DEFAULT_NO_CONTENT_CODES = frozenset( itertools.chain(range(100, 200), [http.client.NO_CONTENT, http.client.NOT_MODIFIED])) '''Status codes where a response body is prohibited.''' ConnectionParams = namedlist.namedtuple('ConnectionParamsType', [ ('bind_address', None), ('keep_alive', True), ('ssl_options', None), ('connect_timeout', None), ('read_timeout', None), ('buffer_size', DEFAULT_BUFFER_SIZE), ('no_content_codes', DEFAULT_NO_CONTENT_CODES), ('ignore_length', False), ]) '''Parameters for connections. Args: bind_address: The IP address to bind the socket. Must match :meth:`socket.SocketType.bind`. Use this if your local host has multiple IP addresses. keep_alive (bool): If True, use HTTP keep-alive. ssl_options: A ``dict`` containing options for :func:`ssl.wrap_socket` connect_timeout (float): If given, the time in seconds before the connection is timed out during connection. Otherwise, depend on the underlying libraries for timeout.
from wpull.driver.process import Process import wpull.util _logger = logging.getLogger(__name__) PhantomJSDriverParams = namedlist.namedtuple( 'PhantomJSDriverParamsType', [ 'url', ('snapshot_paths', []), ('wait_time', 1), ('num_scrolls', 10), ('smart_scroll', True), ('snapshot', True), ('viewport_size', (1200, 1920)), ('paper_size', (2400, 3840)), ('event_log_filename', None), ('action_log_filename', None), ('custom_headers', {}), ('page_settings', {}), ] ) '''PhantomJS Driver parameters Attributes: url (str): URL of page to fetch. snapshot_type (list): List of filenames. Accepted extensions are html, pdf, png, gif. wait_time (float): Time between page scrolls. num_scrolls (int): Maximum number of scrolls.
'query', 'fragment', 'username', 'password', 'hostname', 'port', 'raw', 'encoding', ] ) NormalizationParams = namedlist.namedtuple( 'NormalizationParamsType', [ ('sort_query', False), ('always_delim_query', False) ] ) '''Parameters for URL normalization. Args: sort_query (bool): Whether to sort the query string items. always_delim_query: Whether to always deliminate the key-value items where value is empty. ''' class URLInfo(_URLInfoType): '''A named tuple containing the parts of the URL.
def test_factory_doc_attr(self): Point = namedtuple('Point', 'x y') self.assertEqual(Point.__doc__, 'Point(x, y)')
self.assertEqual(Point.__doc__, "Point(dx=FACTORY({0}), dy=FACTORY({0}), dz=11.0)".format(list_repr)) Point = namedlist('Point', ['dx', 'dy', ('dz', FACTORY(11.0))], default=[]) self.assertEqual(Point.__doc__, 'Point(dx=[], dy=[], dz=FACTORY(11.0))') def test_slice(self): Point = namedlist('Point', 'x y z color') values = [3, 5, -12, 'red'] p = Point(*values) self.assertEqual(values[0:-1], p[0:-1]) self.assertEqual(values[:3], p[:3]) self.assertEqual(values[4:1:-1], p[4:1:-1]) TestNT = namedtuple('TestNT', 'x y z') # type used for pickle tests class TestNamedTuple(unittest.TestCase): def test_unicode_identifiers(self): Point = namedtuple(u'Point', u'x y') p = Point(10, 20) self.assertEqual((p.x, p.y), (10, 20)) self.assertEqual(p._asdict(), {'x':10, 'y':20}) def test_factory(self): Point = namedtuple('Point', 'x y') self.assertEqual(Point.__name__, 'Point') self.assertEqual(Point.__slots__, ()) self.assertEqual(Point.__module__, __name__) self.assertEqual(Point.__getitem__, tuple.__getitem__)
def test_unicode_identifiers(self): Point = namedtuple(u'Point', u'x y') p = Point(10, 20) self.assertEqual((p.x, p.y), (10, 20)) self.assertEqual(p._asdict(), {'x':10, 'y':20})
"""Listing parser."""
import re
import itertools

import namedlist

from wpull.protocol.ftp.ls.date import parse_datetime
import wpull.protocol.ftp.ls.date


# Only ``name`` is required; every other field defaults to None.
FileEntry = namedlist.namedtuple(
    "FileEntryType",
    [
        "name",
        ("type", None),
        ("size", None),
        ("date", None),
        ("dest", None),
        ("perm", None),
    ]
)
"""A row in a listing.

Attributes:
    name (str): Filename.
    type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None``
    size (int, None): Size of file.
    date (:class:`datetime.datetime`, None): A datetime object in UTC.
    dest (str, None): Destination filename for symlinks.
    perm (int, None): Unix permissions expressed as an integer.
"""


class ListingError(ValueError):
    """Error during parsing a listing."""


class UnknownListingError(ListingError):
    """Failed to determine type of listing."""
'''Base classes''' import abc import collections import io import namedlist from wpull.document.base import BaseTextStreamReader, \ BaseHTMLReader, BaseExtractiveReader from wpull.scraper.util import urljoin_safe LinkContext = namedlist.namedtuple('LinkContextType', [ 'link', ('inline', False), ('linked', False), ('link_type', None), ('extra', None) ]) '''A named tuple describing a scraped link. Attributes: link (str): The link that was scraped. inline (bool): Whether the link is an embeded object. linked (bool): Whether the link links to another page. link_type: A value from :class:`.item.LinkType`. extra: Any extra info. ''' class ScrapeResult(dict): '''Links scraped from a document. This class is subclassed from ``dict`` and contains convenience methods. ''' def __init__(self, link_contexts, encoding):
from wpull.processor.base import BaseProcessor, BaseProcessorSession, \
    REMOTE_ERRORS
from wpull.processor.rule import FetchRule, ResultRule, ProcessingRule
from wpull.url import URLInfo
from wpull.writer import BaseFileWriter
import wpull.string
import wpull.util


_logger = StyleAdapter(logging.getLogger(__name__))
_ = gettext.gettext


# NOTE(review): ``content_on_error`` has a default below but is not described
# in the Args text that follows -- confirm its meaning and document it.
WebProcessorFetchParams = namedlist.namedtuple(
    'WebProcessorFetchParamsType',
    [
        ('post_data', None),
        ('strong_redirects', True),
        ('content_on_error', False),
    ]
)
'''WebProcessorFetchParams

Args:
    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.
'''


class HookPreResponseBreak(ProtocolError):
    '''Hook pre-response break.'''
DEFAULT_BUFFER_SIZE = 1048576 '''Default buffer size in bytes.''' DEFAULT_NO_CONTENT_CODES = frozenset(itertools.chain( range(100, 200), [http.client.NO_CONTENT, http.client.NOT_MODIFIED] )) '''Status codes where a response body is prohibited.''' ConnectionParams = namedlist.namedtuple( 'ConnectionParamsType', [ ('bind_address', None), ('keep_alive', True), ('ssl_options', None), ('connect_timeout', None), ('read_timeout', None), ('buffer_size', DEFAULT_BUFFER_SIZE), ('no_content_codes', DEFAULT_NO_CONTENT_CODES), ('ignore_length', False), ] ) '''Parameters for connections. Args: bind_address: The IP address to bind the socket. Must match :meth:`socket.SocketType.bind`. Use this if your local host has multiple IP addresses. keep_alive (bool): If True, use HTTP keep-alive. ssl_options: A ``dict`` containing options for :func:`ssl.wrap_socket` connect_timeout (float): If given, the time in seconds before the
'''Listing parser.''' import re import namedlist from wpull.ftp.ls.date import parse_datetime FileEntry = namedlist.namedtuple( 'FileEntryType', [ 'name', ('type', None), ('size', None), ('date', None), ('dest', None), ('perm', None) ]) '''A row in a listing. Attributes: name (str): Filename. type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None`` size (int, None): Size of file. date (:class:`datetime.datetime`, None): A datetime object in UTC. dest (str, None): Destination filename for symlinks. perm (int, None): Unix permissions expressed as an integer. ''' class ListingError(ValueError):
'nodejs6', 'git', 'julia', 'lua5', 'haskell', 'octave4', 'cpp', 'c', 'java', 'go', 'rust', } lang_aliases = dict() max_upload_size = 5 * 1024 * 1024 # 5 MB VolumeInfo = namedtuple('VolumeInfo', 'name container_path mode') _extra_volumes = { 'python3-tensorflow': [ VolumeInfo('deeplearning-samples', '/home/work/samples', 'ro'), ], 'python3-tensorflow-gpu': [ VolumeInfo('deeplearning-samples', '/home/work/samples', 'ro'), ], } restarting_kernels = {} blocking_cleans = {} async def get_extra_volumes(docker, lang): avail_volumes = (await docker.volumes.list())['Volumes']
'''Listing parser.'''
import re

import namedlist

from wpull.ftp.ls.date import parse_datetime


# Only ``name`` is required; every other field defaults to None.
FileEntry = namedlist.namedtuple(
    'FileEntryType',
    [
        'name',
        ('type', None),
        ('size', None),
        ('date', None),
        ('dest', None),
        ('perm', None),
    ]
)
'''A row in a listing.

Attributes:
    name (str): Filename.
    type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None``
    size (int, None): Size of file.
    date (:class:`datetime.datetime`, None): A datetime object in UTC.
    dest (str, None): Destination filename for symlinks.
    perm (int, None): Unix permissions expressed as an integer.
'''


class ListingError(ValueError):
    '''Error during parsing a listing.'''


class UnknownListingError(ListingError):
    '''Failed to determine type of listing.'''
It must call one of :meth:`.engine.URLItem.set_status` or :meth:`.engine.URLItem.skip`. ''' pass def close(self): '''Run any clean up actions.''' pass WebProcessorFetchParams = namedlist.namedtuple( 'WebProcessorFetchParamsType', [ ('retry_connrefused', False), ('retry_dns_error', False), ('post_data', None), ('strong_robots', True), ('strong_redirects', True), ('content_on_error', False), ] ) '''WebProcessorFetchParams Args: retry_connrefused: If True, don't consider a connection refused error to be a permanent error. retry_dns_error: If True, don't consider a DNS resolution error to be permanent error. post_data (str): If provided, all requests will be POSTed with the given `post_data`. `post_data` must be in percent-encoded query format ("application/x-www-form-urlencoded").
from wpull.errors import ProtocolError from wpull.hook import HookableMixin, Actions from wpull.http.web import LoopType from wpull.processor.base import BaseProcessor, BaseProcessorSession, \ REMOTE_ERRORS from wpull.processor.rule import FetchRule, ResultRule from wpull.stats import Statistics from wpull.writer import NullWriter import wpull.string _logger = logging.getLogger(__name__) _ = gettext.gettext WebProcessorFetchParams = namedlist.namedtuple('WebProcessorFetchParamsType', [ ('post_data', None), ('strong_redirects', True), ('content_on_error', False), ]) '''WebProcessorFetchParams Args: post_data (str): If provided, all requests will be POSTed with the given `post_data`. `post_data` must be in percent-encoded query format ("application/x-www-form-urlencoded"). strong_redirects (bool): If True, redirects are allowed to span hosts. ''' WebProcessorInstances = namedlist.namedtuple('WebProcessorInstancesType', [ ('fetch_rule', FetchRule()), ('result_rule', ResultRule()), ('processing_rule', None),
from wpull.body import Body from wpull.driver.phantomjs import PhantomJSDriverParams, PhantomJSDriver from wpull.namevalue import NameValueRecord from wpull.pipeline.session import ItemSession from wpull.processor.rule import ProcessingRule from wpull.warc.format import WARCRecord import wpull.url PhantomJSParams = namedlist.namedtuple( 'PhantomJSParamsType', [ ('snapshot_types', ('html', 'pdf')), ('wait_time', 1), ('num_scrolls', 10), ('smart_scroll', True), ('snapshot', True), ('viewport_size', (1200, 1920)), ('paper_size', (2400, 3840)), ('load_time', 900), ('custom_headers', {}), ('page_settings', {}), ] ) '''PhantomJS parameters Attributes: snapshot_type (list): File types. Accepted are html, pdf, png, gif. wait_time (float): Time between page scrolls. num_scrolls (int): Maximum number of scrolls. smart_scroll (bool): Whether to stop scrolling if number of requests & responses do not change. snapshot (bool): Whether to take snapshot files.
from wpull.http.web import LoopType from wpull.processor.base import BaseProcessor, BaseProcessorSession, \ REMOTE_ERRORS from wpull.processor.rule import FetchRule, ResultRule from wpull.stats import Statistics from wpull.writer import NullWriter import wpull.string _logger = logging.getLogger(__name__) _ = gettext.gettext WebProcessorFetchParams = namedlist.namedtuple( 'WebProcessorFetchParamsType', [ ('post_data', None), ('strong_redirects', True), ('content_on_error', False), ] ) '''WebProcessorFetchParams Args: post_data (str): If provided, all requests will be POSTed with the given `post_data`. `post_data` must be in percent-encoded query format ("application/x-www-form-urlencoded"). strong_redirects (bool): If True, redirects are allowed to span hosts. ''' WebProcessorInstances = namedlist.namedtuple( 'WebProcessorInstancesType', [
from os import path from namedlist import namedtuple import yaml Config = namedtuple('Config', ['components', 'component_aliases', 'templates', 'state_codes']) def load_config(template_path): templates = {} components = {} component_aliases = {} state_codes = {} if not path.isdir(template_path): raise IOError('Address formatting templates path cannot be found.') # Parse components and component aliases with open(path.join(template_path, 'components.yaml'), 'r') as ymlfile: comps = yaml.safe_load_all(ymlfile) for comp in comps: if 'aliases' in comp: component_aliases.update({alias: comp['name'] for alias in comp['aliases']}) components[comp['name']] = comp.get('aliases') # Parse templates with open(path.join(template_path, 'countries', 'worldwide.yaml'), 'r') as ymlfile: templates = yaml.safe_load(ymlfile) # Parse state codes
session.response(response) def response_data(self, data): for session in self._sessions: session.response_data(data) def __exit__(self, *args): for context in self._contexts: context.__exit__(*args) WARCRecorderParams = namedlist.namedtuple('WARCRecorderParamsType', [('compress', True), ('extra_fields', None), ('temp_dir', None), ('log', True), ('appending', False), ('digests', True), ('cdx', None), ('max_size', None), ('url_table', None), ('software_string', None)]) ''':class:`WARCRecorder` parameters. Args: compress (bool): If True, files will be compressed with gzip extra_fields (list): A list of key-value pairs containing extra metadata fields temp_dir (str): Directory to use for temporary files log (bool): Include the program logging messages in the WARC file appending (bool): If True, the file is not overwritten upon opening digests (bool): If True, the SHA1 hash digests will be written. cdx (bool): If True, a CDX file will be written.
"""Base classes""" import abc import collections import io import namedlist from wpull.document.base import BaseTextStreamReader, BaseHTMLReader, BaseExtractiveReader from wpull.scraper.util import urljoin_safe LinkContext = namedlist.namedtuple( "LinkContextType", ["link", ("inline", False), ("linked", False), ("link_type", None), ("extra", None)] ) """A named tuple describing a scraped link. Attributes: link (str): The link that was scraped. inline (bool): Whether the link is an embeded object. linked (bool): Whether the link links to another page. link_type: A value from :class:`.item.LinkType`. extra: Any extra info. """ class ScrapeResult(dict): """Links scraped from a document. This class is subclassed from ``dict`` and contains convenience methods. """ def __init__(self, link_contexts, encoding):