import os

# HtmlParser and DataStore are assumed to be provided elsewhere in the project.


class MySpider(object):
    def __init__(self, root_url):
        self.parser = HtmlParser()
        self.storage = DataStore()
        self._get_root_urls(root_url)

    def _get_root_urls(self, root_url):
        if not os.path.exists('job_class.json'):
            new_urls = self.parser.get_url(root_url)
            self.storage.local_store(new_urls, 'job_class.json')  # store the industry-category URLs to crawl

    def joburl_init(self, pagenum, path='job_class.json'):
        root_urls = self.storage.load_data(path)
        jobs_dict = {}
        for i in pagenum:
            for job_class in root_urls:
                jobs_dict[job_class + str(i)] = root_urls[job_class] + str(i)  # build the page URLs to crawl
        self.storage.local_store(jobs_dict, 'job_page_url.json')  # store the constructed page URLs

    def company_url(self, path='job_page_url.json'):
        company_urls = self.storage.load_data(path)
        company_dicts = {}
        url_get = 0  # number of page URLs already fetched
        for company_info_url in company_urls:
            print("Industry page URLs left to crawl:", len(company_urls) - url_get)
            url_get += 1
            url = company_urls[company_info_url]
            company_dicts.update(self.parser.getcompany_url(url))
            self.storage.local_store(url, 'job_page_url_old.json')  # store the page URL that has been crawled
        self.storage.local_store(company_dicts, 'company_info_url_new.json')  # store the company-info URLs

    def company_info(self, path='company_info_url_new.json'):
        company_info_urls = self.storage.load_data(path)
        url_get = 0  # number of company-info URLs already fetched
        for company_name in company_info_urls:
            print("Company-info URLs left to crawl:", len(company_info_urls) - url_get)
            url_get += 1
            url = company_info_urls[company_name]
            self.parser.getcompany_info(company_name, url)
            self.storage.local_store(url, 'compang_info_url_old.json')  # store the company-info URL that has been crawled

    # resume fetching company info from the last checkpoint
    def grab_increment(self):
        new_urls = self.storage.load_data('company_info_url_new.json')
        old_urls = self.storage.load_data('compang_info_url_old.json')
        for company_name in new_urls:
            new_url = new_urls[company_name]
            if new_url not in old_urls:
                self.parser.getcompany_info(company_name, new_url)
                self.storage.local_store(new_url, 'compang_info_url_old.json')  # store the company-info URL that has been crawled
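# --- Usage sketch (added for illustration, not part of the original module) ---
# A minimal example of how the MySpider pipeline above might be driven end to
# end. The root URL and the page range are placeholder assumptions; HtmlParser
# and DataStore must be importable from the project for this to run.
if __name__ == '__main__':
    spider = MySpider('https://example.com/job_classes')  # hypothetical root URL
    spider.joburl_init(range(1, 6))   # build page URLs for pages 1-5
    spider.company_url()              # collect company-info URLs from each page
    spider.company_info()             # crawl each company's info page
    # after an interruption, resume from the stored checkpoint files:
    # spider.grab_increment()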
import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth

# CsvFormat, UrlParser, HtmlParser, FileManager and OpenTagFinder are assumed
# to be provided elsewhere in the project.


class Spider:
    def __init__(self, url, number_of_threads=20, allowed_urls=None,
                 blocked_urls=None, basic_auth=(), depth=-1):
        self.url = url
        self.number_of_threads = number_of_threads
        self.allowed_urls = allowed_urls if allowed_urls is not None else []
        # self.blocked_urls = blocked_urls if blocked_urls is not None else []
        self.lost_url = set()
        self.basic_auth = basic_auth
        self.depth = depth
        self.crawl = True
        self.visited = {}
        self.general_visited = set()
        self.unvisited = set()
        self.general_unvisited = {self.url}
        self.fetched_url_record = dict()
        self.csv_table = CsvFormat([
            "url", "status code", "title", "keyword", "description",
            "h1", "h2", "h3", "h4", "h5", "h6",
            "index", "open tags", "external links", "h_tag_format"
        ])
        self.downloaded_pages = {}
        self.record = []
        self.url_parser = UrlParser(url)
        self.parser = HtmlParser()
        self.filemanager = FileManager()

    def start(self):
        self.fetch_html()
        while len(self.general_visited) < len(self.general_unvisited) and self.crawl:
            self.fetch_html()

    def fetch_html(self):
        url = self.get_url()
        if url in self.general_visited or not url:
            return
        res = self.get_html(url)
        if res is None:
            # the request failed; record the URL so the crawl loop does not retry it forever
            self.add_to_visited(url, 0)
            return False
        if res.status_code >= 500:
            self.add_to_visited(url, 500)
            return False
        elif res.status_code >= 400:
            self.save_formated_data(res, url)
            self.add_to_visited(url, 400)
        elif res.status_code >= 300:
            if res.history:
                # redirected away from the target domain; skip it
                if self.url_parser.domain not in res.url:
                    return False
        elif res.status_code >= 200:
            self.save_formated_data(res, url)
            self.add_to_visited(url, 200)

    def save_formated_data(self, response, current_url):
        html = BeautifulSoup(response.content, "lxml")
        self.csv_table.create_row('data')
        h_tags = self.parser.get_all_h(html)
        update = {
            "url": current_url,
            "status code": response.status_code,
            "title": self.parser.get_title(html),
            "keyword": self.parser.get_meta_keyword(html),
            "description": self.parser.get_meta_description(html),
            "h1": self.parser.get_htag("h1", h_tags),
            "h2": self.parser.get_htag("h2", h_tags),
            "h3": self.parser.get_htag("h3", h_tags),
            "h4": self.parser.get_htag("h4", h_tags),
            "h5": self.parser.get_htag("h5", h_tags),
            "h6": self.parser.get_htag("h6", h_tags),
            "index": self.parser.get_meta_index(html),
            "open tags": self.find_open_tags(response.text),
            "external links": self.parser.get_broken_a_tags(response.text, self.url_parser.domain, current_url),
            "h_tag_format": self.parser.tag_structure(response.text),
        }
        if response.status_code >= 400:
            # note which already-fetched pages link to this broken URL
            update["status code"] = str(update["status code"])
            for fetched_page_url, fetched_url_list in self.fetched_url_record.items():
                if current_url in fetched_url_list:
                    update["status code"] += f" found on {fetched_page_url}\n"
        self.csv_table.update_row('data', update)
        self.csv_table.add_row_to_table('data')
        fetched_urls = self.parser.get_url(html, self.url_parser.domain, current_url)
        self.add_to_unvisited(current_url, fetched_urls)

    def get_url(self):
        if not self.unvisited:
            self.unvisited = self.general_unvisited - self.general_visited
        return self.unvisited.pop()

    def add_to_visited(self, key, *args):
        if key not in self.visited and args:
            self.visited[key] = list(args)
            self.general_visited.add(key)

    def add_to_unvisited(self, url, fetched_urls):
        self.fetched_url_record[url] = fetched_urls
        self.general_unvisited.update(fetched_urls)

    def find_open_tags(self, html):
        open_tag_finder = OpenTagFinder()
        open_tag_finder.feed(html)
        open_tag_finder.reset()
        open_tags = open_tag_finder.get_open_tags()
        return open_tags

    def get_html(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95'
        }
        try:
            if self.basic_auth:
                return requests.get(url, headers=headers,
                                    auth=HTTPBasicAuth(self.basic_auth[0], self.basic_auth[1]),
                                    timeout=5.0)
            else:
                return requests.get(url, headers=headers, timeout=80.0)
        except requests.exceptions.RequestException as e:
            print(e)
            self.filemanager.save_to_log(f"{e} in url {url}")
            return
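# --- Usage sketch (added for illustration, not part of the original module) ---
# A minimal example of how the Spider above might be started. The start URL and
# credentials are placeholders; CsvFormat, UrlParser, HtmlParser, FileManager
# and OpenTagFinder must be importable from the project for this to run.
if __name__ == '__main__':
    crawler = Spider('https://example.com',
                     basic_auth=('user', 'password'))  # omit basic_auth for public sites
    crawler.start()  # crawls until every discovered URL has been visited
    # results accumulate in crawler.csv_table and crawler.visited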