Example #1
 def __init__(self, **kw):
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['maxdepth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
     self.resolved_links = {}
     self.user_agent = args['user_agent']
     self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
     if self.phantom:
         self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
     self.errors = 0
     dispatcher.connect(self.closed, spider_closed)
     dispatcher.connect(self.crashed, spider_error)
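The comprehension that builds discover_prefixes expands every configured prefix into both scheme variants, then normalizes each into an LRU. A minimal sketch of that expansion, with url_to_lru_clean stubbed out (the real helper returns the crawler's LRU form of the URL):

 def url_to_lru_clean(url):
     # Stand-in only: the real function normalizes the URL into an LRU.
     return url

 prefixes = ["http://example.org", "https://example.org/section"]
 discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in prefixes for https in ['', 's']]
 print(discover_prefixes)
 # ['http://example.org', 'https://example.org',
 #  'http://example.org/section', 'https://example.org/section']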
Example #2
 def __init__(self, **kw):
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['maxdepth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.discover_prefixes = to_list(args['discover_prefixes'])
     self.user_agent = args['user_agent']
     self.link_extractor = SgmlLinkExtractor(canonicalize=False, deny_extensions=[])
     self.ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])
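IGNORED_EXTENSIONS here is presumably Scrapy's stock list of binary and media suffixes (shipped without leading dots); the set comprehension dots them once so link candidates can be rejected with cheap suffix tests. A small illustration with an assumed, trimmed list:

 # Assumed stand-in; the real IGNORED_EXTENSIONS list is much longer.
 IGNORED_EXTENSIONS = ['pdf', 'zip', 'jpg']

 ignored_exts = set('.' + e for e in IGNORED_EXTENSIONS)

 url = "http://example.org/report.pdf"
 print(any(url.endswith(ext) for ext in ignored_exts))  # True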
Example #3
 def __init__(self, **kw):
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['maxdepth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.discover_prefixes = to_list(args['discover_prefixes'])
     self.user_agent = args['user_agent']
     self.link_extractor = SgmlLinkExtractor(canonicalize=False,
                                             deny_extensions=[])
     self.ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])
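A side note on SgmlLinkExtractor: it was deprecated and later removed from Scrapy. On current Scrapy versions the equivalent construction (an assumption about porting, not part of the original snippet) would be:

 from scrapy.linkextractors import LinkExtractor

 # Same intent as the SgmlLinkExtractor call above: keep URLs as found
 # (no canonicalization) and do not skip any extension by default.
 link_extractor = LinkExtractor(canonicalize=False, deny_extensions=[])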
Example #4
 def __init__(self, **kw):
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['maxdepth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.discover_prefixes = [url_to_lru_clean(u) for u in to_list(args['discover_prefixes'])]
     self.user_agent = args['user_agent']
     self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
     if self.phantom:
         self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
     self.errors = 0
     dispatcher.connect(self.closed, spider_closed)
     dispatcher.connect(self.crashed, spider_error)
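The two dispatcher.connect calls hook the spider's own methods to Scrapy's lifecycle signals, old pydispatch style (newer Scrapy prefers crawler.signals.connect inside from_crawler). A sketch of the wiring, assuming the standalone PyDispatcher package that old Scrapy vendored:

 from pydispatch import dispatcher
 from scrapy.signals import spider_closed, spider_error

 class SpiderStub(object):
     def __init__(self):
         dispatcher.connect(self.closed, spider_closed)
         dispatcher.connect(self.crashed, spider_error)

     def closed(self, reason=None):
         print("closed:", reason)

     def crashed(self, failure=None):
         print("crashed:", failure)

 s = SpiderStub()
 # Scrapy fires these during a real crawl; firing one by hand shows the hookup.
 dispatcher.send(signal=spider_closed, reason="finished")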
Example #5
 def __init__(self, **kw):
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['max_depth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', '')), TLDS_TREE) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
     self.resolved_links = {}
     self.user_agent = args['user_agent']
     self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
     self.cookies = None
     if 'cookies' in args:
         self.cookies = dict(cookie.split('=', 1) for cookie in re.split(r'\s*;\s*', args['cookies']) if '=' in cookie)
     if self.phantom:
         self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
     self.errors = 0
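The cookie handling splits the header-style string on semicolons and each pair on the first '=' only (the maxsplit argument), so values that themselves contain '=' survive intact. A standalone check:

 import re

 raw = "sessionid=abc123; theme=dark; token=a=b=c"
 cookies = dict(cookie.split('=', 1) for cookie in re.split(r'\s*;\s*', raw) if '=' in cookie)
 print(cookies)  # {'sessionid': 'abc123', 'theme': 'dark', 'token': 'a=b=c'}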
Example #6
 def __init__(self, **kw):
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['max_depth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', '')), TLDS_TREE) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
     self.resolved_links = {}
     self.user_agent = args['user_agent']
     self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
     self.cookies = None
     if 'cookies' in args:
         self.cookies = dict(cookie.split('=', 1) for cookie in re.split(r'\s*;\s*', args['cookies']) if '=' in cookie)  # maxsplit=1, or values containing '=' break the dict() pairs
     if self.phantom:
         self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
     self.errors = 0
     dispatcher.connect(self.closed, spider_closed)
     dispatcher.connect(self.crashed, spider_error)
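The phantom flag check reads as: the key must be present, non-empty, and not the literal string "false" in any case for PhantomJS handling to switch on. A compact check of the same expression (phantom_enabled is a hypothetical helper name):

 def phantom_enabled(args):
     # Mirrors the flag expression used in the examples above.
     return bool('phantom' in args and args['phantom'] and args['phantom'].lower() != "false")

 for args in ({}, {'phantom': ''}, {'phantom': 'False'}, {'phantom': 'true'}):
     print(args, '->', phantom_enabled(args))
 # {} -> False, '' -> False, 'False' -> False, 'true' -> True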