Example #1
def __init__(self,
             url,
             number_of_threads=20,
             allowed_urls=None,
             blocked_urls=None,
             basic_auth=(),
             depth=-1):
    self.url = url
    self.number_of_threads = number_of_threads
    # Build fresh lists here: a mutable default argument ([]) would be
    # shared between every call to __init__.
    self.allowed_urls = allowed_urls if allowed_urls is not None else []
    self.blocked_urls = blocked_urls if blocked_urls is not None else []
    self.lost_url = set()
    self.basic_auth = basic_auth
    self.depth = depth
    self.crawl = True
    self.visited = {}
    self.general_visited = set()
    self.unvisited = set()
    self.general_unvisited = {self.url}
    self.fetched_url_record = dict()
    self.csv_table = CsvFormat([
        "url", "status code", "title", "keyword", "description", "h1",
        "h2", "h3", "h4", "h5", "h6", "index", "open tags",
        "external links", "h_tag_format"
    ])
    self.downloaded_pages = {}
    self.record = []
    self.url_parser = UrlParser(url)
    self.parser = HtmlParser()
    self.filemanager = FileManager()
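
Mutable default arguments like `allowed_urls=[]` are a classic Python pitfall: the default list is created once, at function definition time, and shared across every call, which is why the constructor above takes None and builds fresh lists. A minimal self-contained demonstration of the pitfall:

def bad(urls=[]):          # one list object shared by every call
    urls.append("x")
    return urls

def good(urls=None):       # a fresh list is created per call
    urls = [] if urls is None else urls
    urls.append("x")
    return urls

print(bad(), bad())        # ['x', 'x'] ['x', 'x'] -- the same list grew twice
print(good(), good())      # ['x'] ['x']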
Example #2
def __init__(self):
    ''' ok '''
    self.homeUrls = [
        'http://www.cau.edu.cn',
        'http://www.google.com.hk',
        'http://www.baidu.com',
    ]
    self.urlparser = UrlParser(self.homeUrls)
Example #3
    def get_url(self, html, domain):
        url_parser = UrlParser(domain)
        netloc = url_parser.get_netloc(domain)
        urls = set()
        try:
            for bs_object in html.find_all(["a", "img"]):
                # Take the link target from either an <a href> or an <img src>.
                if "href" in bs_object.attrs:
                    raw_url = bs_object.attrs["href"]
                elif "src" in bs_object.attrs:
                    raw_url = bs_object.attrs["src"]
                else:
                    continue

                # Keep only internal links that normalize to a usable url.
                if not url_parser.is_internal(raw_url, url_parser.domain):
                    continue
                # Normalize once and reuse the result instead of calling
                # pretty_url three times.
                url = url_parser.pretty_url(raw_url, url_parser.domain)
                if not url:
                    continue
                if url.count(netloc) > 1:
                    continue
                if "tel:" in raw_url.lower() or "mailto:" in raw_url.lower():
                    continue
                urls.add(url)
        except Exception as e:
            self.filemanager.save_to_log(e)
        return urls
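
UrlParser here is project-specific, but the internal-link test it performs can be approximated with the standard library. A minimal sketch, assuming "internal" means "same host after resolving relative references" (the function name and semantics are an assumption, not the project's API):

from urllib.parse import urljoin, urlparse

def is_internal(raw_url, base):
    # Resolve relative references against the base url, then compare hosts.
    absolute = urljoin(base, raw_url)
    return urlparse(absolute).netloc == urlparse(base).netloc

print(is_internal("/about", "https://example.com/"))               # True
print(is_internal("../img/logo.png", "https://example.com/a/b"))   # True
print(is_internal("https://other.org/x", "https://example.com/"))  # False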
Example #4
    def get_broken_a_tags(self, html, domain, current_url):
        html_soup = BeautifulSoup(html, "lxml")
        url = UrlParser(domain)
        urls = ""
        for bs_object in html_soup.find_all("a"):
            # BeautifulSoup returns multi-valued attributes such as rel
            # as a list.
            if "rel" in bs_object.attrs:
                rel = bs_object["rel"]
            else:
                rel = "no rel attribute"
            if "href" not in bs_object.attrs:
                continue
            line = self.find_line(html, bs_object["href"])
            if url.is_external(bs_object["href"], domain):
                urls += f"line {line}: external url: {bs_object['href']} rel attribute: {rel}\n"

        return urls
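
One detail worth knowing when formatting rel like this: BeautifulSoup parses rel as a multi-valued attribute, so bs_object["rel"] is a list, not a string. A small self-contained check:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<a rel="nofollow noopener" href="/x">x</a>', "lxml")
print(soup.a["rel"])            # ['nofollow', 'noopener'] -- a list
print(" ".join(soup.a["rel"]))  # 'nofollow noopener' -- flatten for display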
Example #5
class debug_UrlParser:
    def __init__(self):
        ''' ok '''
        self.homeUrls = [
            'http://www.cau.edu.cn',
            'http://www.google.com.hk',
            'http://www.baidu.com',
        ]
        self.urlparser = UrlParser(self.homeUrls)

    @dec
    def transToStdUrl(self):
        ''' ok '''
        homeurl = 'http://www.google.com/hello/world'
        print('homeurl', homeurl)
        url = [
            '../index.html',
            './world/trying/tofind/right.html',
        ]
        for u in url:
            print('url', u)
            print('stdurl', self.urlparser.transToStdUrl(homeurl, u))
            print('-' * 20)

    @dec
    def transSiteID(self):
        ''' ok '''
        url = [
            'http://www.cau.edu.cn/index.php',
            'http://www.google.com.hk/helllo/werod',
        ]
        for u in url:
            print(u, '\r', self.urlparser.transSiteID(u), '\r')

    @dec
    def transPath(self):
        ''' ok '''
        pageurl = "http://www.cau.edu.cn/hello/index.html"
        url = "../index"
        print('pageurl', pageurl)
        print('url', url)
        print('path', self.urlparser.transPath(pageurl, url))

    @dec
    def transNetloc(self):
        ''' ok '''
        pageurl = "http://www.cau.edu.cn/hello/index.html"
        print(self.urlparser.transNetloc(pageurl))

    @dec
    def judgeUrl(self):
        ''' ok '''
        pageurl = "http://www.cau.edu.cn/hello/index.html"
        newurl = "./world.php?hdjfsa=dslkfjsaf&lkfjewoif=seklfhehw"
        print(self.urlparser.judgeUrl(pageurl, newurl))
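
transToStdUrl itself is not shown, but resolving a relative reference like '../index.html' against a page url is exactly what urllib.parse.urljoin does, so a stdlib equivalent (an assumption about what the method computes, based on the test inputs above) would behave like this:

from urllib.parse import urljoin

homeurl = 'http://www.google.com/hello/world'
print(urljoin(homeurl, '../index.html'))
# http://www.google.com/index.html
print(urljoin(homeurl, './world/trying/tofind/right.html'))
# http://www.google.com/hello/world/trying/tofind/right.html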
Example #6
    def request(self, url):
        # Create default ssl context to allow HTTPS connections
        context = ssl.create_default_context()

        # Clear data from previous request
        self.data = ''

        # Parse url and connect to host
        host, path = UrlParser.parse_url(url)

        # Connect to host and return if host does not exist
        try:
            sock = socket.create_connection((host, self.port))
            ssock = context.wrap_socket(sock, server_hostname=host)
        except socket.gaierror:
            print('there was an error resolving the host {}'.format(host))
            return
        else:
            with ssock:
                self.sock = ssock
                self.sock.settimeout(2)
                # Send HTTP request
                req = 'GET {} HTTP/1.1\r\nHost: {}\r\n\r\n'.format(path, host)
                print('HTTP request\n{}'.format(req))
                self.sock.sendall(req.encode())
                print('sent HTTP request')

                # Receive and store response; stop when the server closes
                # the connection or the 2-second read timeout fires.
                buffer = ''
                data = self.sock.recv(1024)
                while data:
                    try:
                        # A multi-byte character split across chunks would
                        # raise UnicodeDecodeError here.
                        text = data.decode('utf-8')
                        print(text, flush=True)
                        buffer += text
                    except UnicodeDecodeError:
                        print("Could not decode a block of data using utf-8")
                    try:
                        data = self.sock.recv(1024)
                    except socket.timeout:
                        data = None
                self.data = buffer
                self.sock.shutdown(socket.SHUT_RDWR)
                print("\n\ndone")
Example #7
def main():
    # Check if file with urls is included
    if len(sys.argv) < 2:
        print("usage: tests.py <file with urls>")
        sys.exit()
    urls = sys.argv[1]
    # Read urls line by line and run tests. The results of each test are
    # written to their own file.
    with open(urls, 'r') as fo:
        count = 0
        for line in fo:
            url = line.rstrip('\n')  # get rid of newline at end
            base, path = UrlParser.parse_url(line)
            base = base.split('.')[0]

            # Rebuild the filename from base each time so the code still
            # works once count reaches double digits (slicing the old
            # suffix off only handled single-digit counts).
            test_request_http(url, base + str(count) + '.txt')
            count += 1
            test_request_https(url, base + str(count) + '.txt')
            count += 1
    sys.exit()
Example #8
    def request(self, url, params=None):
        # Create socket
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.sock.settimeout(2)

        # Clear data from previous request
        self.data = ''

        # Parse url and connect to host
        host, path = UrlParser.parse_url(url)
        if not self.connect(self.sock, host):
            return

        # Send HTTP request
        req = 'GET {} HTTP/1.1\r\nHost: {}\r\n\r\n'.format(path, host)
        print('HTTP request\n{}'.format(req))
        self.sock.sendall(req.encode())
        print('sent HTTP request')

        # Receive and store response; the loop ends when the server closes
        # the connection or the 2-second read timeout fires.
        buffer = ''
        data = self.sock.recv(1024)
        while data:
            try:
                buffer += data.decode('utf-8')
            except UnicodeDecodeError:
                print("Could not decode a block of data using utf-8")
            try:
                data = self.sock.recv(1024)
            except socket.timeout:
                data = None
        self.data = buffer

        # Close the connection now that you have everything
        self.sock.shutdown(socket.SHUT_RDWR)
        self.sock.close()
        print("connection closed")
Example #9
    else:
        target_url = 'https://sh.lianjia.com/ershoufang/' + 'pg' + str(i) + '/'
    url_to_file.dump_target_url(target_url + '\n')
url_to_file.close_file()

# Read the urls back from the file
urls = open('url.txt', 'r').readlines()

# Extract the hrefs from each url and write them to a txt file
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/65.0'
}
for url in urls:
    url = url.strip('\n')
    parser = UrlParser(url, header)  # build the parser once per url
    url_soup = parser.get_url_soup()
    s = parser.get_url_href(url_soup)
    for item in s:
        href_to_txt = DataOutput(item).data_to_txt('href.txt')

# Read the hrefs back from href.txt and parse each detail page
f = open('href.txt', 'r').readlines()

for i, detail_href in enumerate(f):  # enumerate instead of f.index()
    print('Processing href #{}'.format(i))
    detail_url = detail_href.strip('\n')
    try:
        detail = UrlParser(detail_url, header)  # use the stripped url
        detail_soup = detail.get_url_soup()
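
The snippet ends mid-try, and UrlParser's API here is project-specific; a hedged, self-contained sketch of the same fetch-and-collect-hrefs pattern using requests and BeautifulSoup directly (the function name and url are placeholders, not the original code):

import requests
from bs4 import BeautifulSoup

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/65.0'
}

def get_hrefs(url):
    # Fetch the page and collect every absolute href on it.
    res = requests.get(url, headers=header, timeout=5)
    soup = BeautifulSoup(res.text, 'lxml')
    return [a['href'] for a in soup.find_all('a', href=True)
            if a['href'].startswith('http')]

# hrefs = get_hrefs('https://sh.lianjia.com/ershoufang/pg1/')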
Example #10
import requests
from bs4 import BeautifulSoup
from htmlparser import HtmlParser
from urlparser import UrlParser
from time import sleep
import codecs
import json
import pandas as pd

visited = set()
unvisited = set()
domain = 'www.motoji.co.jp'
siteUrl = f"https://{domain}/"
parser_url = UrlParser(siteUrl)
parser_html = HtmlParser()
DATA = []

def get_res(url):
    # Fetch the page without following redirects; return False on any
    # network error so the caller can skip the url.
    headers_pc = {'User-Agent': 'robot wpmake'}
    try:
        res = requests.get(url, headers=headers_pc, timeout=5.0, allow_redirects=False)
        return res
    except requests.exceptions.RequestException as e:
        print(e)
        return False

def update_data(url, status_code):
    # Record one crawl result for later export.
    DATA.append({"url": url, "status_code": status_code})
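
The example stops after these helpers; a hedged sketch of how they typically drive a crawl over the visited/unvisited sets defined above (this loop body is an assumption, not part of the original):

# Hypothetical continuation: drain unvisited, recording each status code.
unvisited.add(siteUrl)
while unvisited:
    url = unvisited.pop()
    if url in visited:
        continue
    visited.add(url)
    res = get_res(url)
    if res is False:
        continue
    update_data(url, res.status_code)
    sleep(1)  # be polite to the server between requests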