def crawl(self, url=None):
    if url is None:
        url = self.target_url

    href_links = self.extract_links_from(url)

    for link in href_links:
        link = urlparse2.urljoin(url, link)  # resolves relative links against the current URL

        if '#' in link:
            link = link.split('#')[0]  # strip fragment identifiers

        if self.target_url in link and link not in self.target_links and link not in self.links_to_ignore:
            self.target_links.append(link)
            print(link)
            self.crawl(link)  # recursively crawl each newly discovered link
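
The method assumes a surrounding crawler class that provides target_url, target_links, links_to_ignore, and an extract_links_from helper, none of which appear in the snippet. A minimal sketch of that scaffolding, with a hypothetical BeautifulSoup-based extract_links_from, might look like:

import requests
import urlparse2
from bs4 import BeautifulSoup


class Crawler(object):
    def __init__(self, target_url, links_to_ignore=None):
        self.target_url = target_url
        self.target_links = []
        self.links_to_ignore = links_to_ignore or []

    def extract_links_from(self, url):
        # Hypothetical helper: fetch the page and return every href on it.
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        return [a.get('href') for a in soup.find_all('a', href=True)]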
Example #2
def extract_links(r):
    for x in r.find_all('a'):
        all_url.append(x.get('href'))
    for url in all_url:
        try:
            open_ = requests.get(url)
            response = open_.content.decode('utf-8')
            soup = BeautifulSoup(response, 'html.parser')
            for link in soup.find_all('a'):
                href = link.get('href')
                if href and href.startswith('/'):
                    href = urljoin(url, href)  # resolve root-relative links
                if re.search(r'automobile|automotive', str(href), re.IGNORECASE):
                    links.append(href)
        except Exception as e:
            print(e)
    return links
Example #3

    def submit_form(self, form, value, url):
        action = form.get('action')
        post_url = urlparse2.urljoin(url, action)
        method = form.get('method')

        inputs_list = form.findAll('input')
        post_data = {}

        for input_tag in inputs_list:
            input_name = input_tag.get('name')
            input_type = input_tag.get('type')
            input_value = input_tag.get('value')

            if input_type == 'text':
                input_value = value  # inject the test value into every text field
            post_data[input_name] = input_value

        if method == 'post':
            return self.session.post(post_url, data=post_data)

        return self.session.get(post_url, params=post_data)  # reuse the session so cookies persist
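
A hedged usage sketch: scanner below stands for an instance of the surrounding class, whose constructor is assumed to set self.session = requests.Session(); neither appears in the snippet.

from bs4 import BeautifulSoup

url = 'http://example.com/login'
response = scanner.session.get(url)
parsed_html = BeautifulSoup(response.content, 'html.parser')

for form in parsed_html.findAll('form'):
    result = scanner.submit_form(form, 'test123', url)
    # A reflected marker value suggests the input is echoed back unsanitized.
    if 'test123' in result.content.decode(errors='replace'):
        print('[+] Form input reflected at ' + url)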
Example #4
    def _extract_web_url(self, html, url, domain):
        u"""Extract the list of same-domain URLs from the HTML content.

        Args:
            html: the content to parse
            url: the address of the page being crawled
            domain: the current site's domain

        Return:
            the same-domain URLs found in the HTML content

        """

        url_list = []
        content = BeautifulSoup(html, 'html.parser').findAll('a')
        for item in content:
            href = item.get('href')
            ans = urlparse2.urljoin(url, href)
            ans_netloc = urlparse2.urlparse(ans).netloc
            if domain == ans_netloc:
                url_list.append(ans)
        return url_list
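
A quick illustrative call; the instance, HTML snippet, and domain are made up for the example:

crawler = SiteCrawler()  # hypothetical instance of the surrounding class
same_domain = crawler._extract_web_url(
    '<a href="/about">About</a> <a href="http://other.example.org/">Out</a>',
    'http://site.example.com/index.html',
    'site.example.com')
print(same_domain)  # ['http://site.example.com/about']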
Example #5
def request(url):
    # Reconstructed wrapper: the opening def/try lines were cut off in the
    # original snippet, so the function name here is an assumption.
    try:
        return requests.get(url)
    except requests.exceptions.ConnectionError:
        pass


options = get_arguments()

target_url = options.target
response = request(target_url)

parsed_html = BeautifulSoup(response.content, 'html.parser')
forms_list = parsed_html.findAll("form")

for form in forms_list:
    action = form.get('action')
    post_url = urlparse2.urljoin(target_url, action)
    method = form.get("method")

    inputs_list = form.findAll('input')
    post_data = {}

    for input_tag in inputs_list:
        input_name = input_tag.get('name')
        input_type = input_tag.get('type')
        input_value = input_tag.get('value')

        if input_type == 'text':
            input_value = 'test'
        post_data[input_name] = input_value

    result = requests.post(post_url, data=post_data)
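
The script relies on a get_arguments helper that the snippet does not define. A minimal sketch of what it presumably does, written here with argparse (the flag names are assumptions):

import argparse


def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--target', dest='target', required=True,
                        help='target URL to scan for forms')
    return parser.parse_args()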
Example #6
def join_url_path(url, relative_url):
    return urlparse2.urljoin(url, relative_url)
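
Assuming urlparse2 follows the standard library's urljoin semantics, relative references resolve against the base URL's directory while root-relative ones replace the whole path:

print(join_url_path('http://example.com/docs/index.html', 'guide.html'))
# -> http://example.com/docs/guide.html
print(join_url_path('http://example.com/docs/index.html', '/guide.html'))
# -> http://example.com/guide.html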
Example #8
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 24 18:26:43 2017

@author: aditya royal
"""
import re

import requests
from bs4 import BeautifulSoup
from urlparse2 import urljoin

all_url = list()
links = list()
emails = dict()

r = BeautifulSoup(open('NHVTMA.html'), 'html.parser')

# Collect every href from the local file first...
for x in r.find_all('a'):
    all_url.append(x.get('href'))

# ...then fetch each collected URL and resolve its root-relative links.
for url in all_url:
    try:
        open_ = requests.get(url)
        response = open_.content.decode('utf-8')
        soup = BeautifulSoup(response, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and href.startswith('/'):
                href = urljoin(url, href)
    except Exception as e:
        print(e)
Example #9

import urllib

import urlparse2
from bs4 import BeautifulSoup

url = "http://ing.pub.ro/en/"

# Breadth-first queues: qDeVizitat holds URLs still to visit,
# qVizitate the URLs already seen.
qDeVizitat = [url]
qVizitate = [url]

while len(qDeVizitat) != 0:
    # Python 2 API; on Python 3 this would be urllib.request.urlopen.
    htmltext = urllib.urlopen(qDeVizitat[0]).read()

    ciorba = BeautifulSoup(htmltext, 'html.parser')

    qDeVizitat.pop(0)

    for tag in ciorba.findAll('a', href=True):
        tag['href'] = urlparse2.urljoin(url, tag['href'])
        # Only queue links that stay on the target site and are new.
        if url in tag['href'] and tag['href'] not in qVizitate:
            qDeVizitat.append(tag['href'])
            qVizitate.append(tag['href'])

print(qVizitate)
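
Unlike the recursive crawler in the first example, this script traverses the site iteratively with an explicit to-visit queue, so deep link chains cannot exhaust Python's recursion limit.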
Example #10
title("urlsplit")
benchmark("urlparse4", lambda url: urlparse4.urlsplit(url))
benchmark("pygurl", lambda url: pygurl.ParseStandard(url))
benchmark("uritools", lambda url: uritools_urisplit(url))
benchmark("yurl", lambda url: yurl_url(url))
benchmark("urlparse2", lambda url: urlparse2.urlsplit(url))
benchmark("urlparse", lambda url: urlparse.urlsplit(url))
benchmark("cyuri", lambda url: cyuri_parser.components(url))

title("urljoin_sibling")
benchmark("urlparse4", lambda url: urlparse4.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("pygurl", lambda url: pygurl.URL(url).Resolve("sibling.html?q=1#e=b"))
benchmark("uritools", lambda url: uritools_urijoin(url, "sibling.html?q=1#e=b"))
benchmark("yurl", lambda url: yurl_url(url) + yurl_url("sibling.html?q=1#e=b"))
benchmark("urlparse2", lambda url: urlparse2.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("urlparse", lambda url: urlparse.urljoin(url, "sibling.html?q=1#e=b"))
benchmark("cyuri", lambda url: cyuri_parser.join(url, "sibling.html?q=1#e=b"))

# Not very representative because some libraries have functions to access the host directly without parsing the rest.
# Might still be useful for some people!
title("hostname")
benchmark("urlparse4", lambda url: urlparse4.urlsplit(url).hostname)
benchmark("pygurl", lambda url: pygurl.URL(url).host())
benchmark("uritools", lambda url: uritools_urisplit(url).host)
benchmark("yurl", lambda url: yurl_url(url).host)
benchmark("urlparse2", lambda url: urlparse2.urlsplit(url).hostname)
benchmark("urlparse", lambda url: urlparse.urlsplit(url).hostname)
benchmark("cyuri", lambda url: cyuri_parser.components(url)["host"])

# Very slow!
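
The title and benchmark helpers, along with aliases such as uritools_urisplit, yurl_url, and cyuri_parser, are defined elsewhere in the benchmark script. A minimal sketch of such helpers, timing each callable over a fixed URL workload, could be:

import time

urls = ['http://example.com/a/b.html?q=1'] * 10000  # sample workload; the real script loads a URL corpus


def title(name):
    print('\n== %s ==' % name)


def benchmark(name, func):
    start = time.time()
    for u in urls:
        func(u)
    print('%-10s %.3fs' % (name, time.time() - start))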