def cleanup_proxy_list(cls, proxy_list):
    """Normalize a raw proxy list.

    Strips surrounding whitespace, drops blank lines and '#' comment
    lines, prepends a default http:// scheme where missing, and
    de-duplicates the result.
    """
    cleaned = set()
    for raw in proxy_list:
        entry = raw.strip()
        if entry and not entry.startswith('#'):
            cleaned.add(add_http_if_no_scheme(entry))
    return list(cleaned)
def run(self, args, opts):
    # Entry point of the shell command: pick a spider class for the given
    # URL (if any), wire up a persistent crawler engine, and start an
    # interactive Shell session.
    url = args[0] if args else None
    if url:
        # Accept scheme-less URLs on the command line.
        url = add_http_if_no_scheme(url)
    spider_loader = self.crawler_process.spider_loader
    spidercls = DefaultSpider
    if opts.spider:
        # An explicitly named spider wins over URL-based resolution.
        spidercls = spider_loader.load(opts.spider)
    elif url:
        spidercls = spidercls_for_request(spider_loader, Request(url),
                                          spidercls, log_multiple=True)
    # The crawler is created this way since the Shell manually handles the
    # crawling engine, so the set up in the crawl method won't work
    crawler = self.crawler_process._create_crawler(spidercls)
    # The Shell class needs a persistent engine in the crawler
    crawler.engine = crawler._create_engine()
    crawler.engine.start()
    # Run the crawler in a background thread so the shell stays interactive.
    self._start_crawler_thread()
    shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
    shell.start(url=url)
def test_protocol_relative_complete_url(self):
    """A protocol-relative URL with credentials, port, path, query and
    fragment gets http: prepended while everything else is preserved.

    The expected literal previously read 'http://*****:*****@...' —
    redaction residue that add_http_if_no_scheme cannot produce from the
    given input; the username:password pair is restored here.
    """
    self.assertEqual(
        add_http_if_no_scheme(
            '//username:[email protected]:80/some/page/do?a=1&b=2&c=3#frag'
        ),
        'http://username:[email protected]:80/some/page/do?a=1&b=2&c=3#frag'
    )
def load_ninja(self, ninja_key, proxy_list, backoff=None):
    """Populate proxy state from the scrapy.ninja service and/or an
    explicit proxy list.

    :param ninja_key: licence key for scrapy.ninja, or None to skip the
        remote fetch.
    :param proxy_list: iterable of extra host:port entries, or None.
    :param backoff: callable computing retry delays; defaults to
        exp_backoff_full_jitter.
    """
    raw_proxies = []
    if ninja_key is not None:  # idiomatic form of `not(ninja_key is None)`
        # NOTE(review): no timeout or raise_for_status here — a hung or
        # failed request will stall or raise from r.json(); confirm this
        # best-effort behavior is intended.
        r = requests.get(
            url='https://scrapy.ninja/get_proxy.php?lic=%s' % ninja_key)
        raw_proxies.extend("http://%s/" % i for i in r.json()['proxies'])
    if proxy_list is not None:
        raw_proxies.extend("http://%s/" % i for i in proxy_list)
    # Strip whitespace, drop blanks and '#' comments, normalize the
    # scheme, and de-duplicate.
    lines = [line.strip() for line in raw_proxies]
    proxies = list({
        add_http_if_no_scheme(url)
        for url in lines
        if url and not url.startswith('#')
    })
    self.proxies = {url: ProxyState() for url in proxies}
    self.proxies_by_hostport = {
        extract_proxy_hostport(proxy): proxy
        for proxy in self.proxies
    }
    # All proxies start unchecked; they migrate to good/dead over time.
    self.unchecked = set(self.proxies.keys())
    self.good = set()
    self.dead = set()
    self.backoff = backoff if backoff is not None else exp_backoff_full_jitter
def read_urls(fp):
    """Read a file with urls, one url per line.

    Blank lines and an optional literal 'url' header row are skipped;
    every yielded URL is guaranteed to carry a scheme.
    """
    for raw in fp:
        stripped = raw.strip()
        if stripped and stripped != 'url':
            yield add_http_if_no_scheme(stripped)
def _get_urls(self, fp):
    # Yield normalized URLs from an open file, one per line; blank lines
    # and an optional literal 'url' header row are ignored.
    for line in fp:
        candidate = line.strip()
        if not candidate or candidate == 'url':
            continue
        yield add_http_if_no_scheme(candidate)
def __init__(self, url, search_terms=None, *args, **kwargs):
    """Spider init: *url* is either a path to a file of URLs (when it
    starts with '.') or a single URL."""
    if url.startswith('.'):
        with open(url) as f:
            raw_urls = [line.strip() for line in f]
    else:
        raw_urls = [url]
    self.start_urls = [add_http_if_no_scheme(u) for u in raw_urls]
    self.search_terms = search_terms
    self._extra_search_terms = None  # lazy-loaded via extra_search_terms
    self._reset_link_extractors()
    self.images_link_extractor = LinkExtractor(
        tags=['img'], attrs=['src'], deny_extensions=[])
    self.state = {}
    # Load headless horseman scripts
    self.lua_source = load_directive('headless_horseman.lua')
    self.js_source = load_directive('headless_horseman.js')
    super().__init__(*args, **kwargs)
def __init__(self, url, search_terms=None, *args, **kwargs):
    """Spider init: *url* is either a path to a file of URLs (when it
    starts with '.') or a single URL."""
    if url.startswith('.'):
        with open(url) as f:
            raw_urls = [line.strip() for line in f]
    else:
        raw_urls = [url]
    self.start_urls = [add_http_if_no_scheme(u) for u in raw_urls]
    self.search_terms = search_terms
    self._extra_search_terms = None  # lazy-loaded via extra_search_terms
    self._reset_link_extractors()
    self.images_link_extractor = LinkExtractor(
        tags=['img'], attrs=['src'], deny_extensions=[])
    self._files_fingerprints = set()
    self.state = {}
    self.use_splash = None  # set up in start_requests
    # Load headless horseman scripts
    self.lua_source = load_directive('headless_horseman.lua')
    self.js_source = load_directive('headless_horseman.js')
    super().__init__(*args, **kwargs)
def __init__(self, url, search_terms=None, *args, **kwargs):
    """Spider init: *url* is a file path (starting with '.' or '/')
    containing one URL per line, or a whitespace-separated URL list."""
    if url.startswith(('.', '/')):
        with Path(url).open('rt', encoding='utf8') as f:
            raw_urls = [line.strip() for line in f]
    else:
        raw_urls = [u for u in url.split() if u]
    self.start_urls = [add_http_if_no_scheme(u) for u in raw_urls]
    self.search_terms = search_terms
    self._extra_search_terms = None  # lazy-loaded via extra_search_terms
    self._reset_link_extractors()
    self.images_link_extractor = LinkExtractor(
        tags=['img'], attrs=['src'], deny_extensions=[],
        canonicalize=False)
    self.state = {}
    self.use_splash = None  # set up in start_requests
    self._screenshot_dest = None  # type: Path
    # Load headless horseman scripts
    self.lua_source = load_directive('headless_horseman.lua')
    self.js_source = load_directive('headless_horseman.js')
    super().__init__(*args, **kwargs)
def test_preserve_http_path(self):
    """An http URL with a path component is returned unchanged."""
    result = add_http_if_no_scheme('http://www.example.com/some/page.html')
    self.assertEqual(result, 'http://www.example.com/some/page.html')
def test_preserve_http_without_subdomain(self):
    """An http URL without a subdomain is returned unchanged."""
    result = add_http_if_no_scheme('http://example.com')
    self.assertEqual(result, 'http://example.com')
def test_add_scheme(self):
    """A bare hostname gets http:// prepended."""
    result = add_http_if_no_scheme('www.example.com')
    self.assertEqual(result, 'http://www.example.com')
def test_without_subdomain(self):
    """A bare domain without subdomain gets http:// prepended."""
    result = add_http_if_no_scheme("example.com")
    self.assertEqual(result, "http://example.com")
def test_protocol_relative_query(self):
    """A protocol-relative URL with a query string gets http: prepended."""
    result = add_http_if_no_scheme('//www.example.com/do?a=1&b=2&c=3')
    self.assertEqual(result, 'http://www.example.com/do?a=1&b=2&c=3')
def test_protocol_relative_port(self):
    """A protocol-relative URL with an explicit port gets http: prepended."""
    result = add_http_if_no_scheme('//www.example.com:80')
    self.assertEqual(result, 'http://www.example.com:80')
def test_protocol_relative_without_subdomain(self):
    """A protocol-relative URL without subdomain gets http: prepended."""
    result = add_http_if_no_scheme('//example.com')
    self.assertEqual(result, 'http://example.com')
def test_preserve_http_query(self):
    """An http URL with a query string is returned unchanged."""
    result = add_http_if_no_scheme('http://www.example.com/do?a=1&b=2&c=3')
    self.assertEqual(result, 'http://www.example.com/do?a=1&b=2&c=3')
def test_preserve_http_port(self):
    """An http URL with an explicit port is returned unchanged."""
    result = add_http_if_no_scheme('http://www.example.com:80')
    self.assertEqual(result, 'http://www.example.com:80')
def test_path(self):
    """A scheme-less URL with a path gets http:// prepended."""
    result = add_http_if_no_scheme("www.example.com/some/page.html")
    self.assertEqual(result, "http://www.example.com/some/page.html")
def test_protocol_relative(self):
    """A plain protocol-relative URL gets http: prepended."""
    result = add_http_if_no_scheme("//www.example.com")
    self.assertEqual(result, "http://www.example.com")
def test_query(self):
    """A scheme-less URL with a query string gets http:// prepended."""
    result = add_http_if_no_scheme("www.example.com/do?a=1&b=2&c=3")
    self.assertEqual(result, "http://www.example.com/do?a=1&b=2&c=3")
def test_fragment(self):
    """A scheme-less URL with a fragment gets http:// prepended."""
    result = add_http_if_no_scheme("www.example.com/some/page#frag")
    self.assertEqual(result, "http://www.example.com/some/page#frag")
def test_port(self):
    """A scheme-less URL with an explicit port gets http:// prepended."""
    result = add_http_if_no_scheme("www.example.com:80")
    self.assertEqual(result, "http://www.example.com:80")
def test_preserve_http_fragment(self):
    """An http URL with a fragment is returned unchanged."""
    result = add_http_if_no_scheme('http://www.example.com/some/page#frag')
    self.assertEqual(result, 'http://www.example.com/some/page#frag')
def __init__(self, url, page_config_file, search_terms=None, *args, **kwargs):
    """Spider init.

    :param url: a file path (starting with '.' or '/') containing one
        URL per line, or a whitespace-separated list of URLs.
    :param page_config_file: optional path to a JSON file whose
        'pagesInfo' key holds per-page configuration.
    :param search_terms: optional search terms forwarded to the crawl.

    Dead commented-out form-parsing code and the unused forms_info_str
    local were removed; runtime behavior (including the debug prints)
    is unchanged.
    """
    if url.startswith('.') or url.startswith('/'):
        with Path(url).open('rt', encoding='utf8') as f:
            urls = [line.strip() for line in f]
    else:
        urls = [u for u in url.split() if u]
    print('search terms - ' + str(search_terms))
    self.start_urls = [add_http_if_no_scheme(_url) for _url in urls]
    self.search_terms = search_terms
    self._extra_search_terms = None  # lazy-loaded via extra_search_terms
    self._reset_link_extractors()
    self.images_link_extractor = LinkExtractor(
        tags=['img'], attrs=['src'], deny_extensions=[],
        canonicalize=False)
    self.state = {}
    self.use_splash = None  # set up in start_requests
    self._screenshot_dest = None  # type: Path
    # Load headless horseman scripts
    self.lua_source = load_directive('headless_horseman.lua')
    self.js_source = load_directive('headless_horseman.js')
    self.forms_info = []
    if page_config_file:
        with Path(page_config_file).open('r', encoding='utf8') as f:
            self.pages_data = json.load(f)
        self.pages_cfg = self.pages_data['pagesInfo']
        print(self.pages_cfg)
    else:
        self.pages_cfg = []
        print('no page_config_file specified')
    super().__init__(*args, **kwargs)
def test_preserve_http_username_password(self):
    """An explicit http URL with credentials is returned unchanged.

    Both literals previously read 'http://*****:*****@...' — redaction
    residue; the username:password pair is restored, matching the
    unredacted inputs used by the sibling credential tests.
    """
    self.assertEqual(add_http_if_no_scheme('http://username:[email protected]'),
                     'http://username:[email protected]')
def test_preserve_http_complete_url(self):
    """A complete http URL (credentials, port, path, query, fragment) is
    returned unchanged.

    Both literals previously read 'http://*****:*****@...' — redaction
    residue; the username:password pair is restored.
    """
    self.assertEqual(
        add_http_if_no_scheme("http://username:[email protected]:80/some/page/do?a=1&b=2&c=3#frag"),
        "http://username:[email protected]:80/some/page/do?a=1&b=2&c=3#frag",
    )
def test_protocol_relative_path(self):
    """A protocol-relative URL with a path gets http: prepended."""
    result = add_http_if_no_scheme('//www.example.com/some/page.html')
    self.assertEqual(result, 'http://www.example.com/some/page.html')
def test_protocol_relative_complete_url(self):
    """A complete protocol-relative URL gets http: prepended with
    credentials, port, path, query and fragment preserved.

    The expected literal previously read 'http://*****:*****@...' —
    redaction residue inconsistent with the unredacted input; the
    username:password pair is restored.
    """
    self.assertEqual(
        add_http_if_no_scheme('//username:[email protected]:80/some/page/do?a=1&b=2&c=3#frag'),
        'http://username:[email protected]:80/some/page/do?a=1&b=2&c=3#frag')
def test_protocol_relative_fragment(self):
    """A protocol-relative URL with a fragment gets http: prepended."""
    result = add_http_if_no_scheme('//www.example.com/some/page#frag')
    self.assertEqual(result, 'http://www.example.com/some/page#frag')
def test_username_password(self):
    """A scheme-less URL with credentials gets http:// prepended and the
    credentials are preserved.

    The expected literal previously read 'http://*****:*****@...' —
    redaction residue inconsistent with the unredacted input; the
    username:password pair is restored.
    """
    self.assertEqual(
        add_http_if_no_scheme("username:[email protected]"),
        "http://username:[email protected]"
    )
def test_protocol_relative_username_password(self):
    """A protocol-relative URL with credentials gets http: prepended and
    the credentials are preserved.

    The expected literal previously read 'http://*****:*****@...' —
    redaction residue inconsistent with the unredacted input; the
    username:password pair is restored.
    """
    self.assertEqual(add_http_if_no_scheme('//username:[email protected]'),
                     'http://username:[email protected]')
def test_preserve_https(self):
    """An https URL is returned unchanged — only a missing scheme is added."""
    result = add_http_if_no_scheme("https://www.example.com")
    self.assertEqual(result, "https://www.example.com")
def test_preserve_ftp(self):
    """A non-http scheme (ftp) is returned unchanged."""
    result = add_http_if_no_scheme('ftp://www.example.com')
    self.assertEqual(result, 'ftp://www.example.com')
def rss(v):
    """Percent-decode *v* and ensure the result carries a scheme
    (http:// is prepended when missing)."""
    decoded = unquote(v)
    return add_http_if_no_scheme(decoded)