def _work(self, entry_url):
    """Fetch ``entry_url`` and collect the same-site URLs found in its HTML.

    Steps, in order:

    1. Return an empty ``URLsimilarList`` if the URL's extension is listed
       in ``self.ignore_ext``.
    2. Request the URL.  On a 301/302/307 response the ``Location`` header
       is resolved against ``entry_url``; if the target is on the same
       netloc and has not been seen in ``self.done_url_list``, it is
       requested once and recorded in ``self.done_url_list`` and
       ``self.all_url_list``.  Otherwise the redirect response itself is
       what gets parsed below.
    3. Return an empty list unless the ``Content-Type`` header mentions
       ``html``.
    4. For every entry of ``self.tagMap`` (dicts with ``name``, ``attr``
       and ``type`` keys), select the matching elements, resolve the
       configured attribute against the current URL and keep the results
       that share the page's netloc.

    Args:
        entry_url: URL of the page to fetch.

    Returns:
        A ``URLsimilarList`` holding one ``URLinfo`` per same-site link
        found.  It is empty when the page is skipped or when any exception
        occurs (the exception is logged, never propagated).
    """
    try:
        logger.info("[{}] req = > {}".format(len(self.done_url_list), entry_url))
        if utils.url_ext(entry_url) in self.ignore_ext:
            return URLsimilarList()

        conn = utils.http_req(entry_url)
        # NOTE(review): 303 and 308 are not treated as redirects here --
        # confirm whether that is intentional.
        if conn.status_code in [301, 302, 307]:
            _url = urljoin(entry_url, conn.headers.get("Location", "")).strip()
            _url = utils.normal_url(_url)
            if _url is None:
                return URLsimilarList()

            url_info = URLinfo(entry_url, _url, URLTYPE.document)
            if utils.same_netloc(entry_url, _url) and (url_info not in self.done_url_list):
                # Follow the redirect exactly once; links found below are
                # resolved relative to the redirect target.
                entry_url = _url
                logger.info("[{}] req 302 = > {}".format(
                    len(self.done_url_list), entry_url))
                conn = utils.http_req(_url)
                self.done_url_list.add(url_info)
                self.all_url_list.add(url_info)

        html = conn.content
        if "html" not in conn.headers.get("Content-Type", "").lower():
            return URLsimilarList()

        dom = pq(html)
        ret_url = URLsimilarList()
        for tag in self.tagMap:
            items = dom(tag['name']).items()
            for i in items:
                link = i.attr(tag['attr'])
                if link is None:
                    # The element lacks the attribute (e.g. an inline
                    # <script> without src).  urljoin() would hand back
                    # entry_url unchanged, so the page itself would be
                    # recorded as a link of this tag's type.
                    continue

                _url = urljoin(entry_url, link).strip()
                _url = utils.normal_url(_url)
                if _url is None:
                    continue

                _type = tag["type"]
                if utils.same_netloc(_url, entry_url):
                    url_info = URLinfo(entry_url, _url, _type)
                    ret_url.add(url_info)
                    self.all_url_list.add(url_info)

        return ret_url
    except Exception as e:
        # Best-effort crawl: a failure on one page must not abort the run.
        logger.error("error on {} {}".format(entry_url, e))
        return URLsimilarList()
def run(self):
    """Collect URLs per hostname, then distribute them to matching sites.

    First pass: every site in ``self.sites`` is grouped in
    ``self.domain_map_site`` under its hostname (the part of
    ``utils.get_hostname(site)`` before the first ``:``).  For each
    hostname not yet present in ``self.domain_map_url``, ``self.work`` is
    called once and its result stored there.

    Second pass: each URL stored for a site's hostname is appended to
    ``self.site_map_url[site]`` when it shares the site's netloc and its
    path is neither empty nor ``"/"``.

    Returns:
        ``self.site_map_url``, mapping each site to its list of URLs.
        Sites with no qualifying URL get no key.
    """
    for index, site in enumerate(self.sites, 1):
        host = utils.get_hostname(site).split(":")[0]
        self.domain_map_site.setdefault(host, []).append(site)

        # One lookup per hostname, however many sites share it.
        if host in self.domain_map_url:
            continue

        logger.info("[{}/{}] start SearchEngines work on {}".format(
            index, len(self.sites), site))
        found = self.work(host)
        logger.info("found url {}, by {}".format(len(found), host))
        self.domain_map_url[host] = found

    for site in self.sites:
        host = utils.get_hostname(site).split(":")[0]
        for url in self.domain_map_url.get(host):
            if not utils.same_netloc(site, url):
                continue

            # Skip URLs that point at the site root.
            path = urlparse(url).path
            if not path or path == "/":
                continue

            self.site_map_url.setdefault(site, []).append(url)

    return self.site_map_url