Code Example #1
import os

# HtmlParser and DataStore are assumed to be the project's own parser and
# storage helpers; they are not shown in this example.
class MySpider(object):
    def __init__(self, root_url):
        self.parser = HtmlParser()
        self.storage = DataStore()
        self._get_root_urls(root_url)

    def _get_root_urls(self, root_url):
        # Only fetch the job-category URLs once; 'job_class.json' acts as a cache.
        if not os.path.exists('job_class.json'):
            new_urls = self.parser.get_url(root_url)
            self.storage.local_store(new_urls, 'job_class.json')  # store the job-category URLs to crawl

    def joburl_init(self, pagenum, path='job_class.json'):
        root_urls = self.storage.load_data(path)
        jobs_dict = {}
        # pagenum is assumed to be the number of result pages to build per job category
        for i in range(1, pagenum + 1):
            for job_class in root_urls:
                # build the page URL to crawl for each category and page number
                jobs_dict[job_class + str(i)] = root_urls[job_class] + str(i)
        self.storage.local_store(jobs_dict, 'job_page_url.json')  # store the constructed page URLs

    def company_url(self, path='job_page_url.json'):
        company_urls = self.storage.load_data(path)
        company_dicts = {}
        url_get = 0  # number of URLs fetched so far
        for company_info_url in company_urls:
            print("industry URLs left to crawl:", len(company_urls) - url_get)
            url_get += 1
            url = company_urls[company_info_url]
            company_dicts.update(self.parser.getcompany_url(url))
            self.storage.local_store(url, 'job_page_url_old.json')  # store page URLs already crawled
        self.storage.local_store(company_dicts,
                                 'company_info_url_new.json')  # store the company-info URLs

    def company_info(self, path='company_info_url_new.json'):
        company_info_urls = self.storage.load_data(path)
        url_get = 0  # number of company-info URLs fetched so far
        for company_name in company_info_urls:
            print("company-info URLs left to crawl:", len(company_info_urls) - url_get)
            url_get += 1
            url = company_info_urls[company_name]
            self.parser.getcompany_info(company_name, url)
            self.storage.local_store(
                url, 'company_info_url_old.json')  # store company-info URLs already crawled

    # resume fetching company info from the last checkpoint
    def grab_increment(self):
        new_urls = self.storage.load_data('company_info_url_new.json')
        old_urls = self.storage.load_data('company_info_url_old.json')
        for company_name in new_urls:
            new_url = new_urls[company_name]
            if new_url not in old_urls:
                self.parser.getcompany_info(company_name, new_url)
                self.storage.local_store(
                    new_url, 'company_info_url_old.json')  # store company-info URLs already crawled
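A minimal driver for this example might look like the following sketch. The root URL and page count are illustrative placeholders, and HtmlParser/DataStore are assumed to behave as used above.

# Hypothetical usage sketch for MySpider (URL and page count are placeholders).
if __name__ == '__main__':
    spider = MySpider('https://example.com/jobs')  # collect the job-category URLs
    spider.joburl_init(10)       # build page URLs for pages 1-10 of every category
    spider.company_url()         # collect the company-info URLs from each page
    spider.company_info()        # crawl every company's detail page
    # spider.grab_increment()    # or resume from the last checkpoint instead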
Code Example #2
File: spider.py  Project: Honda-a/seotool
import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth

# CsvFormat, UrlParser, HtmlParser, FileManager and OpenTagFinder are assumed to
# come from the project's own modules; they are not shown in this example.
class Spider:
    def __init__(self,
                 url,
                 number_of_threads=20,
                 allowed_urls=[],
                 blocked_urls=[],
                 basic_auth=(),
                 depth=-1):
        self.url = url
        self.number_of_threads = number_of_threads
        self.allowed_urls = allowed_urls
        # self.blocked_urls = blocked_urls
        self.lost_url = set()
        self.basic_auth = basic_auth
        self.depth = depth
        self.crawl = True
        self.visited = {}
        self.general_visited = set()
        self.unvisited = set()
        self.general_unvisited = {self.url}
        self.fetched_url_record = dict()
        self.csv_table = CsvFormat([
            "url", "status code", "title", "keyword", "description", "h1",
            "h2", "h3", "h4", "h5", "h6", "index", "open tags",
            "external links", "h_tag_format"
        ])
        self.downloaded_pages = {}
        self.record = []
        self.url_parser = UrlParser(url)
        self.parser = HtmlParser()
        self.filemanager = FileManager()

    def start(self):
        self.fetch_html()
        while len(self.general_visited) < len(self.general_unvisited) and self.crawl:
            self.fetch_html()

    def fetch_html(self):
        url = self.get_url()
        if url in self.general_visited or not url:
            return
        res = self.get_html(url)
        if res is None:  # the request failed (see get_html); skip this URL
            return False
        # Classify the response: 5xx is recorded as a failure, 4xx pages are still
        # parsed so broken links can be reported, off-domain 3xx redirects are
        # dropped, and 2xx pages are parsed and their outgoing links queued.
        if res.status_code >= 500:
            self.add_to_visited(url, 500)
            return False
        elif res.status_code >= 400:
            self.save_formated_data(res, url)
            self.add_to_visited(url, 400)
        elif res.status_code >= 300:
            if res.history:
                if self.url_parser.domain not in res.url:
                    return False
        elif res.status_code >= 200:
            self.save_formated_data(res, url)
            self.add_to_visited(url, 200)

    def save_formated_data(self, response, current_url):
        html = BeautifulSoup(response.content, "lxml")
        self.csv_table.create_row('data')
        h_tags = self.parser.get_all_h(html)
        update = {
            "url": current_url,
            "status code": response.status_code,
            "title": self.parser.get_title(html),
            "keyword": self.parser.get_meta_keyword(html),
            "description": self.parser.get_meta_description(html),
            "h1": self.parser.get_htag("h1", h_tags),
            "h2": self.parser.get_htag("h2", h_tags),
            "h3": self.parser.get_htag("h3", h_tags),
            "h4": self.parser.get_htag("h4", h_tags),
            "h5": self.parser.get_htag("h5", h_tags),
            "h6": self.parser.get_htag("h6", h_tags),
            "index": self.parser.get_meta_index(html),
            "open tags": self.find_open_tags(response.text),
            "external links": self.parser.get_broken_a_tags(
                response.text, self.url_parser.domain, current_url),
            "h_tag_format": self.parser.tag_structure(response.text),
        }
        if response.status_code >= 400:
            update["status code"] = str(update["status code"])
            for fetched_page_url, fetched_url_list in self.fetched_url_record.items():
                if current_url in fetched_url_list:
                    update["status code"] += f" {fetched_page_url}にあります、\n"
        self.csv_table.update_row('data', update)
        self.csv_table.add_row_to_table('data')
        fetched_urls = self.parser.get_url(html, self.url_parser.domain,
                                           current_url)
        self.add_to_unvisited(current_url, fetched_urls)

    def get_url(self):
        # Refill the working set with URLs that were discovered but not yet visited.
        if not self.unvisited:
            self.unvisited = self.general_unvisited - self.general_visited
        return self.unvisited.pop()

    def add_to_visited(self, key, *args):
        if key not in self.visited and args:
            self.visited[key] = list(args)
        self.general_visited.add(key)

    def add_to_unvisited(self, url, fetched_urls):
        self.fetched_url_record[url] = fetched_urls
        self.general_unvisited.update(fetched_urls)

    def find_open_tags(self, html):
        open_tag_finder = OpenTagFinder()
        open_tag_finder.feed(html)
        open_tag_finder.reset()
        open_tags = open_tag_finder.get_open_tags()
        return open_tags

    def get_html(self, url):
        headers = {
            'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95')
        }
        try:
            if self.basic_auth:
                return requests.get(url,
                                    headers=headers,
                                    auth=HTTPBasicAuth(self.basic_auth[0],
                                                       self.basic_auth[1]),
                                    timeout=5.0)
            else:
                return requests.get(url, headers=headers, timeout=80.0)
        except requests.exceptions.RequestException as e:
            print(e)
            self.filemanager.save_to_log(f"{e} in url {url}")
            return
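A minimal way to run this crawler might look like the following sketch; the target URL is a placeholder, and what happens to the collected rows depends on the project's CsvFormat and FileManager helpers, which are not shown here.

# Hypothetical usage sketch for Spider (the URL is a placeholder).
if __name__ == '__main__':
    spider = Spider('https://example.com')
    spider.start()  # crawl until every discovered URL has been visited
    # The per-page rows accumulate in spider.csv_table; writing them out is left
    # to the project's CsvFormat/FileManager helpers.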