Code example #1
File: htmlparser.py Project: Honda-a/seotool
    def get_url(self, html, domain):
        url_parser = UrlParser(domain)
        netloc = url_parser.get_netloc(domain)
        urls = set()
        try:
            for bs_object in html.find_all(["a", "img"]):
                raw_url = ""
                if "href" in bs_object.attrs:
                    raw_url = bs_object.attrs["href"]
                elif "src" in bs_object.attrs:
                    raw_url = bs_object.attrs["src"]
                else:
                    continue

                # Keep only internal links that normalize to a usable absolute URL.
                if not url_parser.is_internal(raw_url, url_parser.domain):
                    continue
                pretty = url_parser.pretty_url(raw_url, url_parser.domain)
                if not pretty:
                    continue
                # Guard against malformed URLs that repeat the host.
                if pretty.count(netloc) > 1:
                    continue
                # Ignore phone and e-mail links.
                if "tel:" in raw_url.lower() or "mailto:" in raw_url.lower():
                    continue
                urls.add(pretty)
        except Exception as e:
            self.filemanager.save_to_log(e)
        return urls
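
A minimal usage sketch for get_url, not taken from the project: it expects an already-parsed BeautifulSoup document plus the site domain, and returns the set of normalized internal URLs. The domain and the fetch below are placeholders, and constructing HtmlParser() with no arguments is an assumption based on test.py further down.

import requests
from bs4 import BeautifulSoup
from htmlparser import HtmlParser

domain = "www.example.com"                       # placeholder domain
res = requests.get(f"https://{domain}/", timeout=5.0)
soup = BeautifulSoup(res.text, "lxml")           # get_url iterates soup.find_all(["a", "img"])
internal_urls = HtmlParser().get_url(soup, domain)
print(internal_urls)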
Code example #2
File: spider.py Project: Honda-a/seotool
    def __init__(self,
                 url,
                 number_of_threads=20,
                 allowed_urls=[],
                 blocked_urls=[],
                 basic_auth=(),
                 depth=-1):
        self.url = url
        self.number_of_threads = number_of_threads
        self.allowed_urls = allowed_urls
        # self.blocked_urls = blocked_urls
        self.lost_url = set()
        self.basic_auth = basic_auth
        self.depth = depth
        self.crawl = True
        self.visited = {}
        self.general_visited = set()
        self.unvisited = set()
        self.general_unvisited = {self.url}
        self.fetched_url_record = dict()
        self.csv_table = CsvFormat([
            "url", "status code", "title", "keyword", "description", "h1",
            "h2", "h3", "h4", "h5", "h6", "index", "open tags",
            "external links", "h_tag_format"
        ])
        self.downloaded_pages = {}
        self.record = []
        self.url_parser = UrlParser(url)
        self.parser = HtmlParser()
        self.filemanager = FileManager()
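
A construction sketch, assuming the class defined in spider.py is called Spider and that crawling is started by a separate method (only __init__ is shown above); the start URL, credentials, and keyword values are placeholders. Note that allowed_urls=[] and blocked_urls=[] are mutable default arguments, evaluated once by Python, so callers usually pass explicit lists; blocked_urls is currently accepted but never stored (the assignment is commented out).

from spider import Spider   # assumed class name

spider = Spider(
    "https://www.example.com/",                      # placeholder start URL
    number_of_threads=5,
    allowed_urls=["https://www.example.com/blog/"],
    basic_auth=("user", "password"),                 # placeholder credentials
    depth=2,                                         # the default of -1 presumably means no depth limit
)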
Code example #3
File: htmlparser.py Project: Honda-a/seotool
    def get_broken_a_tags(self, html, domain, current_url):
        html_soup = BeautifulSoup(html, "lxml")
        url = UrlParser(domain)
        rel = ""
        urls = ""
        for bs_object in html_soup.find_all("a"):
            if "rel" in bs_object.attrs:
                rel = bs_object["rel"]
            else:
                rel = "rel 属性はありません"  # message reads: "there is no rel attribute"
            if not ("href" in bs_object.attrs):
                continue
            line = self.find_line(html, bs_object["href"])
            if url.is_external(bs_object["href"], domain):
                # Report line reads: "line {line}: external url: {href} rel attribute: {rel}"
                urls = urls + f"{line}行目、 外部 url: {bs_object['href']} rel属性: {rel}" + "\n"
                continue

        return urls
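
A usage sketch for get_broken_a_tags, again assuming HtmlParser() takes no constructor arguments; the domain and page URL are placeholders. Unlike get_url, this method receives the raw HTML text (it builds its own BeautifulSoup with the lxml parser) and returns a report string with one line per external <a> tag and its rel attribute.

import requests
from htmlparser import HtmlParser

domain = "www.example.com"                             # placeholder
page_url = f"https://{domain}/about/"
html_text = requests.get(page_url, timeout=5.0).text   # raw HTML string, not a soup
report = HtmlParser().get_broken_a_tags(html_text, domain, page_url)
print(report)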
Code example #4
    else:
        target_url = 'https://sh.lianjia.com/ershoufang/' + 'pg' + str(i) + '/'
    url_to_file.dump_target_url(target_url + '\n')
url_to_file.close_file()

# Read the URLs back from the file
urls = open('url.txt', 'r').readlines()
#print(urls)

# Extract hrefs from each URL and write them to a txt file
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/65.0'
}
for url in urls:
    url = url.strip('\n')
    url_soup = UrlParser(url, header).get_url_soup()
    s = UrlParser(url, header).get_url_href(url_soup)
    for item in s:
        href_to_txt = DataOutput(item).data_to_txt('href.txt')

# Read the hrefs from href.txt and parse each one
f = open('href.txt', 'r').readlines()

for detail_href in f:
    i = f.index(detail_href)
    print('正在处理第{}个href'.format(i))
    detail_url = detail_href.strip('\n')
    try:
        global detail
        detail = UrlParser(detail_url, header)  # use the stripped URL; detail_href still ends in "\n"
        detail_soup = detail.get_url_soup()
Code example #5
File: test.py Project: Honda-a/seotool
import requests
from bs4 import BeautifulSoup
from htmlparser import HtmlParser
from urlparser import UrlParser
from time import sleep
import codecs
import json
import pandas as pd

visited = set()
unvisited = set()
domain = 'www.motoji.co.jp'
siteUrl = f"https://{domain}/"
praser_url = UrlParser(siteUrl)
parser_html = HtmlParser()
DATA = []

def get_res(url):
    headers_pc = {'User-Agent': 'robot wpmake'}
    try:
        # Redirects are not followed, so 3xx responses keep their own status code.
        res = requests.get(url, headers=headers_pc, timeout=5.0, allow_redirects=False)
        return res
    except requests.exceptions.RequestException as e:
        print(e)
        return False

def update_data(url, status_code):
    DATA.append({"url": url, "status_code": status_code})
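
A sketch of how these pieces could be driven together; only get_res, update_data, and the module-level names above come from the snippet, while the loop itself is an assumption about how the truncated script continues.

# Hypothetical driver loop, not taken from test.py.
unvisited.add(siteUrl)
while unvisited:
    url = unvisited.pop()
    if url in visited:
        continue
    visited.add(url)
    res = get_res(url)
    if res is False:                     # get_res returns False when the request raised
        continue
    update_data(url, res.status_code)
    sleep(1)                             # be polite between requests

pd.DataFrame(DATA).to_csv("status_codes.csv", index=False)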