def __init__(self, **kw):
    """Set up the instance from DEFAULT_INPUT overridden by keyword arguments.

    Each keyword in ``kw`` replaces the matching DEFAULT_INPUT entry. The
    merged mapping is kept on ``self.args`` and the individual settings are
    unpacked onto attributes. Finally ``self.closed`` and ``self.crashed``
    are connected to the ``spider_closed`` and ``spider_error`` signals.
    """
    options = DEFAULT_INPUT.copy()
    options.update(kw)
    self.args = options

    self.start_urls = to_list(options['start_urls'])
    self.maxdepth = int(options['maxdepth'])
    self.follow_prefixes = to_list(options['follow_prefixes'])
    self.nofollow_prefixes = to_list(options['nofollow_prefixes'])

    # Every discover prefix is expanded into two entries: any 'http://' or
    # 'https://' text is removed from it, then an http:// and an https://
    # variant are rebuilt and passed through url_to_lru_clean.
    discover = []
    for prefix in to_list(options['discover_prefixes']):
        bare = prefix.replace('http://', '').replace('https://', '')
        for secure in ('', 's'):
            discover.append(url_to_lru_clean("http%s://%s" % (secure, bare)))
    self.discover_prefixes = discover

    self.resolved_links = {}
    self.user_agent = options['user_agent']

    # Phantom mode is enabled by any truthy value whose lowercased text is
    # not "false"; a missing key leaves it at False.
    phantom_opt = options.get('phantom', False)
    self.phantom = phantom_opt and phantom_opt.lower() != "false"
    if self.phantom:
        # Timeouts fall back to the PHANTOM defaults when not supplied.
        timeout = options.get('phantom_timeout', PHANTOM['TIMEOUT'])
        self.ph_timeout = int(timeout)
        idle_timeout = options.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT'])
        self.ph_idle_timeout = int(idle_timeout)
        ajax_timeout = options.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT'])
        self.ph_ajax_timeout = int(ajax_timeout)

    self.errors = 0
    dispatcher.connect(self.closed, spider_closed)
    dispatcher.connect(self.crashed, spider_error)
def __init__(self, **kw):
    """Set up the instance from DEFAULT_INPUT overridden by keyword arguments.

    The merged mapping is stored on ``self.args``; crawl settings are
    unpacked onto attributes, and a link extractor plus the set of ignored
    file extensions are prepared.
    """
    settings = DEFAULT_INPUT.copy()
    settings.update(kw)
    self.args = settings

    self.start_urls = to_list(settings['start_urls'])
    self.maxdepth = int(settings['maxdepth'])

    # Prefix lists are normalised through to_list.
    self.follow_prefixes = to_list(settings['follow_prefixes'])
    self.nofollow_prefixes = to_list(settings['nofollow_prefixes'])
    self.discover_prefixes = to_list(settings['discover_prefixes'])

    self.user_agent = settings['user_agent']

    # The extractor is created with canonicalisation off and an empty
    # deny_extensions list.
    self.link_extractor = SgmlLinkExtractor(
        canonicalize=False,
        deny_extensions=[],
    )
    # Extensions are stored with a leading dot.
    self.ignored_exts = set('.' + ext for ext in IGNORED_EXTENSIONS)
def __init__(self, **kw):
    """Set up the instance from DEFAULT_INPUT overridden by keyword arguments.

    The merged mapping is stored on ``self.args`` and its entries are
    unpacked onto attributes. Discover prefixes are converted with
    url_to_lru_clean. ``self.closed`` and ``self.crashed`` are connected
    to the ``spider_closed`` and ``spider_error`` signals.
    """
    conf = DEFAULT_INPUT.copy()
    conf.update(kw)
    self.args = conf

    self.start_urls = to_list(conf['start_urls'])
    self.maxdepth = int(conf['maxdepth'])
    self.follow_prefixes = to_list(conf['follow_prefixes'])
    self.nofollow_prefixes = to_list(conf['nofollow_prefixes'])
    self.discover_prefixes = list(
        map(url_to_lru_clean, to_list(conf['discover_prefixes']))
    )
    self.user_agent = conf['user_agent']

    # Phantom mode: off when the key is absent; otherwise a falsy value is
    # kept as-is and a truthy one is on unless its lowercased text is "false".
    phantom = False
    if 'phantom' in conf:
        raw = conf['phantom']
        phantom = raw and raw.lower() != "false"
    self.phantom = phantom

    if self.phantom:
        # (attribute, argument key, PHANTOM default key), applied in order.
        timeout_specs = (
            ('ph_timeout', 'phantom_timeout', 'TIMEOUT'),
            ('ph_idle_timeout', 'phantom_idle_timeout', 'IDLE_TIMEOUT'),
            ('ph_ajax_timeout', 'phantom_ajax_timeout', 'AJAX_TIMEOUT'),
        )
        for attr, key, fallback in timeout_specs:
            setattr(self, attr, int(conf.get(key, PHANTOM[fallback])))

    self.errors = 0
    dispatcher.connect(self.closed, spider_closed)
    dispatcher.connect(self.crashed, spider_error)
def __init__(self, **kw):
    """Set up the instance from DEFAULT_INPUT overridden by keyword arguments.

    The merged mapping is stored on ``self.args`` and its entries are
    unpacked onto attributes. An optional ``cookies`` argument, given as
    ``name=value`` pairs separated by ';', is parsed into a dict.
    """
    params = DEFAULT_INPUT.copy()
    params.update(kw)
    self.args = params

    self.start_urls = to_list(params['start_urls'])
    self.maxdepth = int(params['max_depth'])
    self.follow_prefixes = to_list(params['follow_prefixes'])
    self.nofollow_prefixes = to_list(params['nofollow_prefixes'])

    # Every discover prefix yields two entries: any 'http://' or 'https://'
    # text is removed, then an http:// and an https:// variant are rebuilt
    # and passed to url_to_lru_clean together with TLDS_TREE.
    lru_prefixes = []
    for prefix in to_list(params['discover_prefixes']):
        host_part = prefix.replace('http://', '').replace('https://', '')
        for suffix in ('', 's'):
            url = "http%s://%s" % (suffix, host_part)
            lru_prefixes.append(url_to_lru_clean(url, TLDS_TREE))
    self.discover_prefixes = lru_prefixes

    self.resolved_links = {}
    self.user_agent = params['user_agent']

    # Phantom mode is enabled by any truthy value whose lowercased text is
    # not "false"; a missing key leaves it at False.
    phantom_opt = params.get('phantom', False)
    self.phantom = phantom_opt and phantom_opt.lower() != "false"

    self.cookies = None
    if 'cookies' in params:
        jar = {}
        for pair in re.split(r'\s*;\s*', params['cookies']):
            # Fragments without '=' are ignored; only the first '=' splits
            # name from value, so values may themselves contain '='.
            if '=' in pair:
                name, _, value = pair.partition('=')
                jar[name] = value
        self.cookies = jar

    if self.phantom:
        # Timeouts fall back to the PHANTOM defaults when not supplied.
        self.ph_timeout = int(
            params.get('phantom_timeout', PHANTOM['TIMEOUT'])
        )
        self.ph_idle_timeout = int(
            params.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT'])
        )
        self.ph_ajax_timeout = int(
            params.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT'])
        )

    self.errors = 0
def __init__(self, **kw):
    """Set up the instance from DEFAULT_INPUT overridden by keyword arguments.

    The merged mapping is stored on ``self.args`` and its entries are
    unpacked onto attributes. An optional ``cookies`` argument, given as
    ``name=value`` pairs separated by ';', is parsed into a dict.
    ``self.closed`` and ``self.crashed`` are connected to the
    ``spider_closed`` and ``spider_error`` signals.
    """
    args = DEFAULT_INPUT.copy()
    args.update(kw)
    self.args = args
    self.start_urls = to_list(args['start_urls'])
    self.maxdepth = int(args['max_depth'])
    self.follow_prefixes = to_list(args['follow_prefixes'])
    self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
    # Every discover prefix yields two entries: any 'http://' or 'https://'
    # text is removed, then an http:// and an https:// variant are rebuilt
    # and passed to url_to_lru_clean together with TLDS_TREE.
    self.discover_prefixes = [
        url_to_lru_clean(
            "http%s://%s" % (https, u.replace('http://', '').replace('https://', '')),
            TLDS_TREE,
        )
        for u in to_list(args['discover_prefixes'])
        for https in ['', 's']
    ]
    self.resolved_links = {}
    self.user_agent = args['user_agent']
    # Phantom mode is enabled by any truthy value whose lowercased text is
    # not "false".
    self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
    self.cookies = None
    if 'cookies' in args:
        # Split on the first '=' only: cookie values may legitimately
        # contain '=' (e.g. base64 padding), and an unbounded split would
        # produce more than two fields and make dict() raise ValueError.
        self.cookies = dict(
            cookie.split('=', 1)
            for cookie in re.split(r'\s*;\s*', args['cookies'])
            if '=' in cookie
        )
    if self.phantom:
        # Timeouts fall back to the PHANTOM defaults when not supplied.
        self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
        self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
        self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
    self.errors = 0
    dispatcher.connect(self.closed, spider_closed)
    dispatcher.connect(self.crashed, spider_error)