def __init__(self, url, number_of_threads=20, allowed_urls=None, blocked_urls=None, basic_auth=(), depth=-1):
    self.url = url
    self.number_of_threads = number_of_threads
    # Avoid mutable default arguments: fall back to a fresh list when none is given.
    self.allowed_urls = allowed_urls if allowed_urls is not None else []
    # self.blocked_urls = blocked_urls
    self.lost_url = set()
    self.basic_auth = basic_auth
    self.depth = depth
    self.crawl = True
    self.visited = {}
    self.general_visited = set()
    self.unvisited = set()
    self.general_unvisited = {self.url}
    self.fetched_url_record = dict()
    self.csv_table = CsvFormat([
        "url", "status code", "title", "keyword", "description",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "index", "open tags", "external links", "h_tag_format"
    ])
    self.downloaded_pages = {}
    self.record = []
    self.url_parser = UrlParser(url)
    self.parser = HtmlParser()
    self.filemanager = FileManager()
def get_url(self, html, domain):
    url_parser = UrlParser(domain)
    netloc = url_parser.get_netloc(domain)
    urls = set()
    try:
        for bs_object in html.find_all(["a", "img"]):
            # Take the link target from either href (anchors) or src (images).
            if "href" in bs_object.attrs:
                raw_url = bs_object.attrs["href"]
            elif "src" in bs_object.attrs:
                raw_url = bs_object.attrs["src"]
            else:
                continue
            if not url_parser.is_internal(raw_url, url_parser.domain):
                continue
            url = url_parser.pretty_url(raw_url, url_parser.domain)
            if not url:
                continue
            # Skip malformed links where the host appears more than once.
            if url.count(netloc) > 1:
                continue
            if "tel:" in raw_url.lower() or "mailto:" in raw_url.lower():
                continue
            urls.add(url)
    except Exception as e:
        self.filemanager.save_to_log(e)
    return urls
def get_broken_a_tags(self, html, domain, current_url):
    html_soup = BeautifulSoup(html, "lxml")
    url = UrlParser(domain)
    urls = ""
    for bs_object in html_soup.find_all("a"):
        if "rel" in bs_object.attrs:
            rel = bs_object["rel"]
        else:
            rel = "no rel attribute"
        if "href" not in bs_object.attrs:
            continue
        line = self.find_line(html, bs_object["href"])
        if url.is_external(bs_object["href"], domain):
            urls += f"line {line}: external url: {bs_object['href']} rel attribute: {rel}\n"
            continue
    return urls
class debug_UrlParser:
    def __init__(self):
        ''' ok '''
        self.homeUrls = [
            'http://www.cau.edu.cn',
            'http://www.google.com.hk',
            'http://www.baidu.com',
        ]
        self.urlparser = UrlParser(self.homeUrls)

    @dec
    def transToStdUrl(self):
        ''' ok '''
        homeurl = 'http://www.google.com/hello/world'
        print('homeurl', homeurl)
        url = [
            '../index.html',
            './world/trying/tofind/right.html',
        ]
        for u in url:
            print('url', u)
            print('stdurl', self.urlparser.transToStdUrl(homeurl, u))
            print('-' * 20)

    @dec
    def transSiteID(self):
        ''' ok '''
        url = [
            'http://www.cau.edu.cn/index.php',
            'http://www.google.com.hk/helllo/werod',
        ]
        for u in url:
            print(u, '\r', self.urlparser.transSiteID(u), '\r')

    @dec
    def transPath(self):
        ''' ok '''
        pageurl = "http://www.cau.edu.cn/hello/index.html"
        url = "../index"
        print('pageurl', pageurl)
        print('url', url)
        print('path', self.urlparser.transPath(pageurl, url))

    @dec
    def transNetloc(self):
        ''' ok '''
        pageurl = "http://www.cau.edu.cn/hello/index.html"
        print(self.urlparser.transNetloc(pageurl))

    @dec
    def judgeUrl(self):
        ''' ok '''
        pageurl = "http://www.cau.edu.cn/hello/index.html"
        newurl = "./world.php?hdjfsa=dslkfjsaf&lkfjewoif=seklfhehw"
        print(self.urlparser.judgeUrl(pageurl, newurl))
def request(self, url):
    # Create default ssl context to allow HTTPS connections
    context = ssl.create_default_context()
    # Clear data from previous request
    self.data = ''
    # Parse url and connect to host
    host, path = UrlParser.parse_url(url)
    # Connect to host and return if host does not exist
    try:
        sock = socket.create_connection((host, self.port))
        ssock = context.wrap_socket(sock, server_hostname=host)
    except socket.gaierror:
        print('there was an error resolving the host {}'.format(host))
        return
    else:
        with ssock:
            self.sock = ssock
            self.sock.settimeout(2)
            # Send HTTP request
            req = 'GET {} HTTP/1.1\r\nHost: {}\r\n\r\n'.format(path, host)
            print('HTTP request\n{}'.format(req))
            self.sock.send(req.encode())
            print('sent HTTP request')
            # Receive and store response
            buffer = ''
            data = self.sock.recv(1024)
            while data:
                try:
                    print(data.decode('utf-8'), flush=True)
                    buffer += data.decode('utf-8')
                except UnicodeError:
                    print("Could not decode a block of data using utf-8")
                try:
                    data = self.sock.recv(1024)
                except OSError:
                    # A read timeout or socket error ends the receive loop.
                    data = None
            self.data = buffer
            self.sock.shutdown(socket.SHUT_RDWR)
            print("\n\ndone")
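# Note: the request code above relies on UrlParser.parse_url to split a URL into
# (host, path). That implementation is not shown here; the following is only a
# minimal standard-library sketch of the assumed behaviour (function name and
# defaults are assumptions, not the original code).
from urllib.parse import urlparse

def parse_url(url):
    # Assumed behaviour: "https://host/some/path" -> ("host", "/some/path"),
    # with the path defaulting to "/" when the URL has none.
    parts = urlparse(url if "://" in url else "http://" + url)
    return parts.netloc, parts.path or "/"

# Example: parse_url("https://example.com/hello") returns ("example.com", "/hello")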
def main():
    # Check if file with urls is included
    if len(sys.argv) < 2:
        print("usage: tests.py <file with urls>")
        sys.exit()
    urls = sys.argv[1]
    # Read urls line by line and run tests. The results of each test will be written to its own file
    with open(urls, 'r') as fo:
        count = 0
        for line in fo:
            filename, path = UrlParser.parse_url(line)
            filename = filename.split('.')[0]
            filename = filename + str(count) + '.txt'
            test_request_http(line[:-1], filename)
            count += 1
            filename = filename[:-5] + str(count) + '.txt'  # works up to single digit count
            test_request_https(line[:-1], filename)  # Get rid of newline at end
            # filename = filename + str(count) + '.txt'
            # test_request_https(line[:-1], filename)  # Get rid of newline at end
            count += 1
    sys.exit()
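# The "works up to single digit count" note above points at a real limitation:
# filename[:-5] only strips "<one digit>.txt". A hypothetical counter-safe helper
# (not part of the original tests; it only reuses UrlParser.parse_url) could
# rebuild the name instead of slicing:
def result_filename(url, count):
    # Build "<host prefix><count>.txt" from scratch for any counter width.
    host, _path = UrlParser.parse_url(url)
    return "{}{}.txt".format(host.split('.')[0], count)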
def request(self, url, params=None):
    # Create socket
    self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.sock.settimeout(2)
    # Clear data from previous request
    self.data = ''
    # Parse url and connect to host
    host, path = UrlParser.parse_url(url)
    if not self.connect(self.sock, host):
        return
    # Send HTTP request
    req = 'GET {} HTTP/1.1\r\nHost: {}\r\n\r\n'.format(path, host)
    print('HTTP request\n{}'.format(req))
    self.sock.send(req.encode())
    print('sent HTTP request')
    # Receive and store response
    buffer = ''
    data = self.sock.recv(1024)
    while data:
        try:
            buffer += data.decode('utf-8')
        except UnicodeDecodeError:
            print("Could not decode a block of data using utf-8")
        try:
            data = self.sock.recv(1024)
        except OSError:
            # A read timeout or socket error ends the receive loop.
            data = None
    self.data = buffer
    # Close the connection now that you have everything
    self.sock.shutdown(socket.SHUT_RDWR)
    self.sock.close()
    print("connection closed")
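# The plain-HTTP request above delegates connection setup to self.connect(...),
# which is not shown. A plausible sketch, mirroring the error handling of the
# HTTPS variant (the method body is an assumption; only the call signature and
# the True/False return convention come from the call site):
def connect(self, sock, host):
    try:
        sock.connect((host, self.port))
        return True
    except socket.gaierror:
        print('there was an error resolving the host {}'.format(host))
        return False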
    else:
        target_url = 'https://sh.lianjia.com/ershoufang/' + 'pg' + str(i) + '/'
    url_to_file.dump_target_url(target_url + '\n')
url_to_file.close_file()

# Read the urls back from the file
urls = open('url.txt', 'r').readlines()
# print(urls)

# Extract hrefs from each url and write them to a txt file
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/65.0'
}
for url in urls:
    url = url.strip('\n')
    url_soup = UrlParser(url, header).get_url_soup()
    s = UrlParser(url, header).get_url_href(url_soup)
    for item in s:
        href_to_txt = DataOutput(item).data_to_txt('href.txt')

# Read the hrefs from href.txt and parse each one
f = open('href.txt', 'r').readlines()
for detail_href in f:
    i = f.index(detail_href)
    print('Processing href number {}'.format(i))
    detail_url = detail_href.strip('\n')
    try:
        global detail
        detail = UrlParser(detail_href, header)
        detail_soup = detail.get_url_soup()
import requests
from bs4 import BeautifulSoup
from htmlparser import HtmlParser
from urlparser import UrlParser
from time import sleep
import codecs
import json
import pandas as pd

visited = set()
unvisited = set()
domain = 'www.motoji.co.jp'
siteUrl = f"https://{domain}/"
praser_url = UrlParser(siteUrl)
parser_html = HtmlParser()
DATA = []


def get_res(url):
    headers_pc = {'User-Agent': 'robot wpmake'}
    try:
        res = requests.get(url, headers=headers_pc, timeout=5.0, allow_redirects=False)
        return res
    except requests.exceptions.RequestException as e:
        print(e)
        return False


def update_data(url, status_code):
    DATA.append({"url": url, "status_code": status_code})
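# The module above only sets up shared state and two helpers; the crawl loop
# itself is not shown. A minimal sketch of how get_res and update_data might be
# tied together (the traversal order, the absolute-URL link filter, and the page
# limit are assumptions, not the original script's logic):
def crawl(start_url, limit=50):
    # Walk same-domain links, recording each page's status code in DATA.
    unvisited.add(start_url)
    while unvisited and len(visited) < limit:
        url = unvisited.pop()
        if url in visited:
            continue
        visited.add(url)
        res = get_res(url)
        if res is False:
            continue
        update_data(url, res.status_code)
        # Queue further same-domain absolute links found on this page.
        soup = BeautifulSoup(res.text, "html.parser")
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.startswith(f"https://{domain}") and href not in visited:
                unvisited.add(href)
        sleep(1)  # be polite to the target server

# Example usage: crawl(siteUrl)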