def get_url(self, html, domain):
    """Collect normalized internal URLs from the <a href> and <img src> attributes of a parsed page."""
    url_parser = UrlParser(domain)
    netloc = url_parser.get_netloc(domain)
    urls = set()
    try:
        for bs_object in html.find_all(["a", "img"]):
            if "href" in bs_object.attrs:
                raw_url = bs_object.attrs["href"]
            elif "src" in bs_object.attrs:
                raw_url = bs_object.attrs["src"]
            else:
                continue
            # Skip external links and links that cannot be normalized.
            if not url_parser.is_internal(raw_url, url_parser.domain):
                continue
            url = url_parser.pretty_url(raw_url, url_parser.domain)
            if not url:
                continue
            # Skip malformed URLs that repeat the netloc, and tel:/mailto: schemes.
            if url.count(netloc) > 1:
                continue
            if "tel:" in raw_url.lower() or "mailto:" in raw_url.lower():
                continue
            urls.add(url)
    except Exception as e:
        self.filemanager.save_to_log(e)
    return urls
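# A minimal usage sketch for get_url(): it expects an already-parsed BeautifulSoup
# tree plus the crawl domain and returns a set of normalized internal URLs.
# "crawler" stands for a hypothetical instance of the class that defines get_url()
# (see the constructor sketch below); the fetch step is likewise illustrative.
import requests
from bs4 import BeautifulSoup

res = requests.get("https://example.com/", timeout=5.0)
soup = BeautifulSoup(res.text, "lxml")
internal_links = crawler.get_url(soup, "https://example.com/")
print(internal_links)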
def __init__(self, url, number_of_threads=20, allowed_urls=None, blocked_urls=None,
             basic_auth=(), depth=-1):
    self.url = url
    self.number_of_threads = number_of_threads
    # None defaults avoid sharing a mutable default list between instances.
    self.allowed_urls = allowed_urls if allowed_urls is not None else []
    # self.blocked_urls = blocked_urls
    self.lost_url = set()
    self.basic_auth = basic_auth
    self.depth = depth
    self.crawl = True
    self.visited = {}
    self.general_visited = set()
    self.unvisited = set()
    self.general_unvisited = {self.url}
    self.fetched_url_record = dict()
    self.csv_table = CsvFormat([
        "url", "status code", "title", "keyword", "description",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "index", "open tags", "external links", "h_tag_format",
    ])
    self.downloaded_pages = {}
    self.record = []
    self.url_parser = UrlParser(url)
    self.parser = HtmlParser()
    self.filemanager = FileManager()
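# A hedged sketch of constructing the object this __init__ belongs to. The class
# name "Crawler" is a placeholder for illustration; only parameters visible in the
# signature are used, and the meaning noted for each is inferred from its name.
crawler = Crawler(
    "https://example.com/",
    number_of_threads=10,         # size of the worker pool (default 20)
    allowed_urls=["/blog/"],      # optional allow-list; defaults to an empty list
    basic_auth=("user", "pass"),  # credentials tuple; empty tuple when unused
    depth=2,                      # crawl depth limit; -1 appears to mean unlimited
)
print(crawler.general_unvisited)  # starts out as {"https://example.com/"}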
def get_broken_a_tags(self, html, domain, current_url):
    """List external <a> links found in the page, with their line number and rel attribute."""
    html_soup = BeautifulSoup(html, "lxml")
    url = UrlParser(domain)
    rel = ""
    urls = ""
    for bs_object in html_soup.find_all("a"):
        if "rel" in bs_object.attrs:
            rel = bs_object["rel"]
        else:
            rel = "no rel attribute"
        if "href" not in bs_object.attrs:
            continue
        line = self.find_line(html, bs_object["href"])
        if url.is_external(bs_object["href"], domain):
            urls = urls + f"line {line}: external url: {bs_object['href']} rel attribute: {rel}" + "\n"
    return urls
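# A minimal usage sketch for get_broken_a_tags(): given raw HTML, the crawl domain,
# and the current page URL, it returns a newline-separated report of external <a>
# tags with their line number and rel attribute. "crawler" is the same hypothetical
# instance as above; the HTML below is illustrative.
sample_html = """<html><body>
<a href="https://external.example.org/page" rel="nofollow">external</a>
<a href="/internal/">internal</a>
</body></html>"""

report = crawler.get_broken_a_tags(sample_html, "https://example.com/", "https://example.com/")
print(report or "no external links found")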
    else:
        target_url = 'https://sh.lianjia.com/ershoufang/' + 'pg' + str(i) + '/'
    url_to_file.dump_target_url(target_url + '\n')
url_to_file.close_file()

# Read the listing-page URLs back from url.txt.
urls = open('url.txt', 'r').readlines()
# print(urls)

# Extract each listing's href from every page and append it to a txt file.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/65.0'
}
for url in urls:
    url = url.strip('\n')
    url_soup = UrlParser(url, header).get_url_soup()
    s = UrlParser(url, header).get_url_href(url_soup)
    for item in s:
        href_to_txt = DataOutput(item).data_to_txt('href.txt')

# Read the hrefs from href.txt and parse each detail page.
f = open('href.txt', 'r').readlines()
for detail_href in f:
    i = f.index(detail_href)
    print('Processing href no. {}'.format(i))
    detail_url = detail_href.strip('\n')
    try:
        global detail
        detail = UrlParser(detail_url, header)  # request the stripped detail URL
        detail_soup = detail.get_url_soup()
import requests
from bs4 import BeautifulSoup
from htmlparser import HtmlParser
from urlparser import UrlParser
from time import sleep
import codecs
import json
import pandas as pd

visited = set()
unvisited = set()

domain = 'www.motoji.co.jp'
siteUrl = f"https://{domain}/"
praser_url = UrlParser(siteUrl)
parser_html = HtmlParser()
DATA = []


def get_res(url):
    """Fetch url without following redirects; return the response, or False on a request error."""
    headers_pc = {'User-Agent': 'robot wpmake'}
    try:
        res = requests.get(url, headers=headers_pc, timeout=5.0, allow_redirects=False)
        return res
    except requests.exceptions.RequestException as e:
        print(e)
        return False


def update_data(url, status_code):
    DATA.append({"url": url, "status_code": status_code})
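# A short sketch combining get_res() and update_data() to record the status of the
# start page. get_res() returns False on a request error, so that case is recorded
# with a sentinel value; the final DataFrame step is an assumption suggested by the
# pandas import, not something the excerpt itself does.
res = get_res(siteUrl)
if res is not False:
    update_data(siteUrl, res.status_code)
else:
    update_data(siteUrl, "request failed")

df = pd.DataFrame(DATA)
print(df)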