def crawl(self, url=None):
    if url is None:
        url = self.target_url
    href_links = self.extract_links_from(url)  # bug fix: self was being passed twice
    for link in href_links:
        link = urlparse2.urljoin(url, link)  # prepends target_url if the link is relative
        if '#' in link:
            link = link.split('#')[0]  # drop the fragment so anchors don't create duplicates
        if self.target_url in link and link not in self.target_links and link not in self.links_to_ingore:
            self.target_links.append(link)
            print(link)
            self.crawl(link)  # recursively crawls each newly discovered link
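# A minimal sketch of the surrounding class that crawl() assumes. target_url,
# target_links, links_to_ingore and extract_links_from() are never shown in the
# snippet above, so everything here is an assumption made for illustration.
import re
import requests
import urlparse2

class Crawler(object):  # hypothetical class name
    def __init__(self, target_url):
        self.target_url = target_url
        self.target_links = []     # every in-scope link discovered so far
        self.links_to_ingore = []  # spelling kept to match the method above
        self.session = requests.Session()

    def extract_links_from(self, url):
        # assumed helper: fetch the page and pull href values out with a regex
        response = self.session.get(url)
        return re.findall('(?:href=")(.*?)"', response.content.decode(errors='ignore'))

Crawler.crawl = crawl  # attach the method defined above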
def extract_links(r):
    for x in r.find_all('a'):
        all_url.append(x.get('href'))
    for url in all_url:
        try:
            open_ = requests.get(url)
            response = open_.content.decode('utf-8')
            soup = BeautifulSoup(response, 'html.parser')
            for link in soup.find_all('a'):
                href = link.get('href')  # bug fix: join and store the href string, not the tag object
                if href and href.startswith('/'):
                    href = urljoin(url, href)  # resolve relative paths against the page URL
                if href and re.search(r'automobile|automotive', str(link), re.IGNORECASE):
                    links.append(href)
        except Exception as e:
            print(e)
    return links
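# Hedged usage sketch for extract_links(): the function mutates the module-level
# all_url and links lists, so both must exist before the call; 'seed_page.html'
# is an illustrative file name, not one from the original.
from bs4 import BeautifulSoup

all_url, links = [], []
page = BeautifulSoup(open('seed_page.html'), 'html.parser')
print(extract_links(page))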
def submit_form(self, form, value, url):
    action = form.get('action')
    post_url = urlparse2.urljoin(url, action)
    method = form.get("method")
    inputs_list = form.findAll('input')
    post_data = {}
    for input_tag in inputs_list:  # renamed from 'input' to avoid shadowing the builtin
        input_name = input_tag.get('name')
        input_type = input_tag.get('type')
        input_value = input_tag.get('value')
        if input_type == 'text':
            input_value = value  # inject the test value into every text field
        post_data[input_name] = input_value
    if method == 'post':  # bug fix: the colon was missing
        return self.session.post(post_url, data=post_data)
    return self.session.get(post_url, params=post_data)  # reuse the session for GET forms too
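# Hedged usage sketch: submit_form() above is a method, so it needs an object
# carrying a requests.Session in self.session. The Scanner shell below is an
# assumption; only submit_form()'s signature comes from the snippet above.
import requests
from bs4 import BeautifulSoup

class Scanner(object):  # hypothetical host class
    def __init__(self):
        self.session = requests.Session()

Scanner.submit_form = submit_form  # attach the method defined above

scanner = Scanner()
target = 'http://example.com/login'  # illustrative URL
soup = BeautifulSoup(scanner.session.get(target).content, 'html.parser')
for form in soup.findAll('form'):
    print(scanner.submit_form(form, 'test', target).status_code)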
def _extract_web_url(self, html, url, domain):
    """Extract the same-domain URLs from an HTML document.

    Args:
        html: the content to parse
        url: the address of the crawled page
        domain: the domain of the current site

    Returns:
        The list of URLs in the HTML content that share the same domain.
    """
    url_list = []
    content = BeautifulSoup(html, 'html.parser').findAll('a')  # pass an explicit parser
    for item in content:
        href = item.get('href')
        ans = urlparse2.urljoin(url, href)
        ans_netloc = urlparse2.urlparse(ans).netloc
        if domain == ans_netloc:
            url_list.append(ans)
    return url_list
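# Hedged usage sketch for _extract_web_url(): fetch one page, then keep only the
# links that stay on the same host. The Spider shell and the start URL are
# assumptions for illustration; the method body above is unchanged.
import requests
import urlparse2

class Spider(object):  # hypothetical host class
    pass

Spider._extract_web_url = _extract_web_url  # attach the method defined above

start = 'http://example.com/'
html = requests.get(start).text
domain = urlparse2.urlparse(start).netloc
print(Spider()._extract_web_url(html, start, domain))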
def get_response(url):  # assumed wrapper name; the definition line was cut off in the source
    try:
        return requests.get(url)
    except requests.exceptions.ConnectionError:
        pass

options = get_arguments()
target_url = options.target
response = requests.get(target_url)  # bug fix: the requests module itself is not callable
parsed_html = BeautifulSoup(response.content, 'html.parser')
forms_list = parsed_html.findAll("form")
for form in forms_list:
    action = form.get('action')
    post_url = urlparse2.urljoin(target_url, action)
    method = form.get("method")
    inputs_list = form.findAll('input')
    post_data = {}
    for input_tag in inputs_list:  # renamed from 'input' to avoid shadowing the builtin
        input_name = input_tag.get('name')
        input_type = input_tag.get('type')
        input_value = input_tag.get('value')
        if input_type == 'text':
            input_value = 'test'
        post_data[input_name] = input_value
    result = requests.post(post_url, data=post_data)
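# get_arguments() is called above but never defined in this snippet; a minimal
# optparse-style sketch under that assumption (the -t/--target flag names are a
# guess, not from the original):
import optparse

def get_arguments():
    parser = optparse.OptionParser()
    parser.add_option('-t', '--target', dest='target', help='target URL to test')
    (options, arguments) = parser.parse_args()
    return options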
def join_url_path(url, relative_url):
    return urlparse2.urljoin(url, relative_url)
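# A few illustrative joins, assuming urlparse2 follows the stdlib urljoin
# semantics (RFC 3986 relative-reference resolution):
print(join_url_path('http://example.com/a/b.html', 'c.html'))   # http://example.com/a/c.html
print(join_url_path('http://example.com/a/b.html', '/c.html'))  # http://example.com/c.html
print(join_url_path('http://example.com/a/', '../c.html'))      # http://example.com/c.html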
benchmark("urlparse4", lambda url: urlparse4.urlsplit(url)) benchmark("pygurl", lambda url: pygurl.ParseStandard(url)) benchmark("uritools", lambda url: uritools_urisplit(url)) benchmark("yurl", lambda url: yurl_url(url)) benchmark("urlparse2", lambda url: urlparse2.urlsplit(url)) benchmark("urlparse", lambda url: urlparse.urlsplit(url)) benchmark("cyuri", lambda url: cyuri_parser.components(url)) title("urljoin_sibling") benchmark("urlparse4", lambda url: urlparse4.urljoin(url, "sibling.html?q=1#e=b")) benchmark("pygurl", lambda url: pygurl.URL(url).Resolve("sibling.html?q=1#e=b")) benchmark("uritools", lambda url: uritools_urijoin(url, "sibling.html?q=1#e=b")) benchmark("yurl", lambda url: yurl_url(url) + yurl_url("sibling.html?q=1#e=b")) benchmark("urlparse2", lambda url: urlparse2.urljoin(url, "sibling.html?q=1#e=b")) benchmark("urlparse", lambda url: urlparse.urljoin(url, "sibling.html?q=1#e=b")) benchmark("cyuri", lambda url: cyuri_parser.join(url, "sibling.html?q=1#e=b")) # Not very representative because some libraries have functions to access the host directly without parsing the rest. # Might still be useful for some people! title("hostname") benchmark("urlparse4", lambda url: urlparse4.urlsplit(url).hostname) benchmark("pygurl", lambda url: pygurl.URL(url).host()) benchmark("uritools", lambda url: uritools_urisplit(url).host) benchmark("yurl", lambda url: yurl_url(url).host) benchmark("urlparse2", lambda url: urlparse2.urlsplit(url).hostname) benchmark("urlparse", lambda url: urlparse.urlsplit(url).hostname) benchmark("cyuri", lambda url: cyuri_parser.components(url)["host"]) # Very slow!
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 24 18:26:43 2017

@author: aditya royal
"""
import re

import requests
from bs4 import BeautifulSoup
from urlparse2 import urljoin

all_url = list()
links = list()
emails = dict()

r = BeautifulSoup(open('NHVTMA.html'), 'html.parser')
for x in r.find_all('a'):
    all_url.append(x.get('href'))
for url in all_url:
    try:
        open_ = requests.get(url)
        response = open_.content.decode('utf-8')
        soup = BeautifulSoup(response, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')  # bug fix: inspect the href attribute, not the anchor text
            if href and href.startswith('/'):
                href = urljoin(url, href)
    except Exception as e:
        print(e)
import urllib  # needed for urllib.urlopen (Python 2)

import urlparse2
from bs4 import BeautifulSoup

url = "http://ing.pub.ro/en/"
qDeVizitat = [url]  # queue of URLs still to visit
qVizitate = [url]   # URLs already seen
while len(qDeVizitat) != 0:
    htmltext = urllib.urlopen(qDeVizitat[0]).read()
    ciorba = BeautifulSoup(htmltext, 'html.parser')  # pass an explicit parser
    qDeVizitat.pop(0)
    for tag in ciorba.findAll('a', href=True):
        tag['href'] = urlparse2.urljoin(url, tag['href'])
        if url in tag['href'] and tag['href'] not in qVizitate:
            qDeVizitat.append(tag['href'])
            qVizitate.append(tag['href'])
print(qVizitate)
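# The loop above is Python 2 only (urllib.urlopen). Under Python 3 the same
# fetch would go through urllib.request; a minimal sketch of just that line:
from urllib.request import urlopen
htmltext = urlopen(url).read()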
title("urlsplit") benchmark("urlparse4", lambda url: urlparse4.urlsplit(url)) benchmark("pygurl", lambda url: pygurl.ParseStandard(url)) benchmark("uritools", lambda url: uritools_urisplit(url)) benchmark("yurl", lambda url: yurl_url(url)) benchmark("urlparse2", lambda url: urlparse2.urlsplit(url)) benchmark("urlparse", lambda url: urlparse.urlsplit(url)) benchmark("cyuri", lambda url: cyuri_parser.components(url)) title("urljoin_sibling") benchmark("urlparse4", lambda url: urlparse4.urljoin(url, "sibling.html?q=1#e=b")) benchmark("pygurl", lambda url: pygurl.URL(url).Resolve("sibling.html?q=1#e=b")) benchmark("uritools", lambda url: uritools_urijoin(url, "sibling.html?q=1#e=b")) benchmark("yurl", lambda url: yurl_url(url) + yurl_url("sibling.html?q=1#e=b")) benchmark("urlparse2", lambda url: urlparse2.urljoin(url, "sibling.html?q=1#e=b")) benchmark("urlparse", lambda url: urlparse.urljoin(url, "sibling.html?q=1#e=b")) benchmark("cyuri", lambda url: cyuri_parser.join(url, "sibling.html?q=1#e=b")) # Not very representative because some libraries have functions to access the host directly without parsing the rest. # Might still be useful for some people! title("hostname") benchmark("urlparse4", lambda url: urlparse4.urlsplit(url).hostname) benchmark("pygurl", lambda url: pygurl.URL(url).host()) benchmark("uritools", lambda url: uritools_urisplit(url).host) benchmark("yurl", lambda url: yurl_url(url).host) benchmark("urlparse2", lambda url: urlparse2.urlsplit(url).hostname) benchmark("urlparse", lambda url: urlparse.urlsplit(url).hostname) benchmark("cyuri", lambda url: cyuri_parser.components(url)["host"]) # Very slow!