Python DEFAULT_INPUT Examples

Programming Language: Python

Namespace/Package Name: hcicrawler.samples

Class/Type: DEFAULT_INPUT

Examples at hotexamples.com: 6

Python DEFAULT_INPUT - 6 examples found. These are the top-rated real-world Python examples of hcicrawler.samples.DEFAULT_INPUT, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

copy(3)

Example #1

Show file

File: pages.py Project: noscripter/hyphe

 def __init__(self, **kw):
     """Initialise the instance from DEFAULT_INPUT overridden by ``kw``.

     A copy of ``DEFAULT_INPUT`` is updated with the keyword arguments,
     stored on ``self.args``, and the individual settings are then read
     out of that merged mapping.
     """
     settings = DEFAULT_INPUT.copy()
     settings.update(kw)
     self.args = settings
     self.start_urls = to_list(settings['start_urls'])
     self.maxdepth = int(settings['maxdepth'])
     self.follow_prefixes = to_list(settings['follow_prefixes'])
     self.nofollow_prefixes = to_list(settings['nofollow_prefixes'])

     # Every discover prefix is stripped of its scheme text and then
     # registered twice: once as http:// and once as https://.
     cleaned_prefixes = []
     for prefix in to_list(settings['discover_prefixes']):
         bare = prefix.replace('http://', '').replace('https://', '')
         for scheme_suffix in ['', 's']:
             cleaned_prefixes.append(
                 url_to_lru_clean("http%s://%s" % (scheme_suffix, bare)))
     self.discover_prefixes = cleaned_prefixes

     self.resolved_links = {}
     self.user_agent = settings['user_agent']

     # Stays falsy when 'phantom' is absent or empty; otherwise any value
     # other than "false" (compared case-insensitively) switches it on.
     if 'phantom' in settings:
         raw_phantom = settings['phantom']
         self.phantom = raw_phantom and raw_phantom.lower() != "false"
     else:
         self.phantom = False

     if self.phantom:
         # Each timeout falls back to the matching PHANTOM entry.
         timeout = settings.get('phantom_timeout', PHANTOM['TIMEOUT'])
         self.ph_timeout = int(timeout)
         idle_timeout = settings.get(
             'phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT'])
         self.ph_idle_timeout = int(idle_timeout)
         ajax_timeout = settings.get(
             'phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT'])
         self.ph_ajax_timeout = int(ajax_timeout)

     self.errors = 0
     # Hook the instance's handlers onto the two dispatcher signals.
     dispatcher.connect(self.closed, spider_closed)
     dispatcher.connect(self.crashed, spider_error)

Example #2

Show file

File: pages.py Project: c24b/Hypertext-Corpus-Initiative

 def __init__(self, **kw):
     """Initialise the instance from DEFAULT_INPUT overridden by ``kw``.

     The merged settings are kept on ``self.args``; the crawl options,
     a link extractor and the set of ignored extensions are derived
     from them.
     """
     merged = DEFAULT_INPUT.copy()
     merged.update(kw)
     self.args = merged

     self.start_urls = to_list(merged['start_urls'])
     self.maxdepth = int(merged['maxdepth'])
     self.follow_prefixes = to_list(merged['follow_prefixes'])
     self.nofollow_prefixes = to_list(merged['nofollow_prefixes'])
     self.discover_prefixes = to_list(merged['discover_prefixes'])
     self.user_agent = merged['user_agent']

     # An empty deny list is passed explicitly; extensions are tracked
     # separately in self.ignored_exts below.
     self.link_extractor = SgmlLinkExtractor(
         canonicalize=False,
         deny_extensions=[],
     )
     # Extensions are stored with their leading dot.
     self.ignored_exts = set('.' + ext for ext in IGNORED_EXTENSIONS)

Example #3

Show file

 def __init__(self, **kw):
     """Build the instance configuration.

     ``kw`` overrides the entries of a copy of ``DEFAULT_INPUT``; the
     result is stored on ``self.args`` and unpacked into attributes.
     """
     options = DEFAULT_INPUT.copy()
     options.update(kw)
     self.args = options

     self.start_urls = to_list(options['start_urls'])
     self.maxdepth = int(options['maxdepth'])
     self.follow_prefixes = to_list(options['follow_prefixes'])
     self.nofollow_prefixes = to_list(options['nofollow_prefixes'])
     self.discover_prefixes = to_list(options['discover_prefixes'])
     self.user_agent = options['user_agent']

     extractor = SgmlLinkExtractor(canonicalize=False, deny_extensions=[])
     self.link_extractor = extractor

     # Collect every ignored extension prefixed with a dot.
     dotted_exts = set()
     for ext in IGNORED_EXTENSIONS:
         dotted_exts.add('.' + ext)
     self.ignored_exts = dotted_exts

Example #4

Show file

File: pages.py Project: imclab/Hypertext-Corpus-Initiative

 def __init__(self, **kw):
     """Initialise the instance from DEFAULT_INPUT overridden by ``kw``.

     The merged mapping is stored on ``self.args``. Discover prefixes
     are passed through ``url_to_lru_clean``, and the phantom timeouts
     are only set when the phantom flag is enabled.
     """
     conf = DEFAULT_INPUT.copy()
     conf.update(kw)
     self.args = conf
     self.start_urls = to_list(conf['start_urls'])
     self.maxdepth = int(conf['maxdepth'])
     self.follow_prefixes = to_list(conf['follow_prefixes'])
     self.nofollow_prefixes = to_list(conf['nofollow_prefixes'])

     lru_prefixes = []
     for prefix in to_list(conf['discover_prefixes']):
         lru_prefixes.append(url_to_lru_clean(prefix))
     self.discover_prefixes = lru_prefixes

     self.user_agent = conf['user_agent']

     # Falsy when 'phantom' is missing or empty; otherwise true unless
     # the value reads "false" in any letter case.
     if 'phantom' not in conf:
         self.phantom = False
     else:
         phantom_value = conf['phantom']
         self.phantom = (
             phantom_value and phantom_value.lower() != "false")

     if self.phantom:
         # PHANTOM supplies the default for each timeout not given.
         self.ph_timeout = int(
             conf.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(
             conf.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(
             conf.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))

     self.errors = 0
     # Hook the instance's handlers onto the two dispatcher signals.
     dispatcher.connect(self.closed, spider_closed)
     dispatcher.connect(self.crashed, spider_error)

Example #5

Show file

 def __init__(self, **kw):
     """Initialise the instance from DEFAULT_INPUT overridden by ``kw``.

     The merged mapping is stored on ``self.args``. Besides the crawl
     options, an optional ``cookies`` string is parsed into a dict and
     the phantom timeouts are set when the phantom flag is enabled.
     """
     settings = DEFAULT_INPUT.copy()
     settings.update(kw)
     self.args = settings
     self.start_urls = to_list(settings['start_urls'])
     self.maxdepth = int(settings['max_depth'])
     self.follow_prefixes = to_list(settings['follow_prefixes'])
     self.nofollow_prefixes = to_list(settings['nofollow_prefixes'])

     # Every discover prefix is stripped of its scheme text and then
     # registered twice: once as http:// and once as https://.
     lru_prefixes = []
     for prefix in to_list(settings['discover_prefixes']):
         bare = prefix.replace('http://', '').replace('https://', '')
         for scheme_suffix in ['', 's']:
             full_url = "http%s://%s" % (scheme_suffix, bare)
             lru_prefixes.append(url_to_lru_clean(full_url, TLDS_TREE))
     self.discover_prefixes = lru_prefixes

     self.resolved_links = {}
     self.user_agent = settings['user_agent']

     # Falsy when 'phantom' is missing or empty; otherwise true unless
     # the value reads "false" in any letter case.
     if 'phantom' in settings:
         phantom_value = settings['phantom']
         self.phantom = phantom_value and phantom_value.lower() != "false"
     else:
         self.phantom = False

     self.cookies = None
     if 'cookies' in settings:
         # Split on ';' (surrounding whitespace dropped), skip chunks
         # without '=', and cut each remaining chunk at its first '='.
         jar = {}
         for chunk in re.split(r'\s*;\s*', settings['cookies']):
             if '=' in chunk:
                 name, value = chunk.split('=', 1)
                 jar[name] = value
         self.cookies = jar

     if self.phantom:
         # PHANTOM supplies the default for each timeout not given.
         timeout = settings.get('phantom_timeout', PHANTOM['TIMEOUT'])
         self.ph_timeout = int(timeout)
         idle_timeout = settings.get(
             'phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT'])
         self.ph_idle_timeout = int(idle_timeout)
         ajax_timeout = settings.get(
             'phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT'])
         self.ph_ajax_timeout = int(ajax_timeout)

     self.errors = 0

Example #6

Show file

File: pages.py Project: medialab/hyphe

 def __init__(self, **kw):
     """Initialise the instance from DEFAULT_INPUT overridden by ``kw``.

     A copy of ``DEFAULT_INPUT`` is updated with the keyword arguments
     and stored on ``self.args``; the individual settings are read from
     that merged mapping.

     An optional ``cookies`` entry of the form ``"a=1; b=2"`` is parsed
     into ``self.cookies`` (``None`` when the entry is absent). Each
     cookie is split at its first ``=`` only, so values that themselves
     contain ``=`` are kept whole.
     """
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['max_depth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     # Every discover prefix is stripped of its scheme text and then
     # registered twice: once as http:// and once as https://.
     self.discover_prefixes = [
         url_to_lru_clean(
             "http%s://%s" %
             (https, u.replace('http://', '').replace('https://', '')),
             TLDS_TREE)
         for u in to_list(args['discover_prefixes']) for https in ['', 's']
     ]
     self.resolved_links = {}
     self.user_agent = args['user_agent']
     # Falsy when 'phantom' is missing or empty; otherwise true unless
     # the value reads "false" in any letter case.
     self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
     self.cookies = None
     if 'cookies' in args:
         # maxsplit=1 is required: without it a value containing '='
         # yields more than two parts and dict() raises ValueError.
         self.cookies = dict(
             cookie.split('=', 1)
             for cookie in re.split(r'\s*;\s*', args['cookies'])
             if '=' in cookie
         )
     if self.phantom:
         # PHANTOM supplies the default for each timeout not given.
         self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
     self.errors = 0
     # Hook the instance's handlers onto the two dispatcher signals.
     dispatcher.connect(self.closed, spider_closed)
     dispatcher.connect(self.crashed, spider_error)