def get_format_html(cls, html, final_url):
    """Return the normalized HTML of a page.

    Rewrites protocol-relative and domain-relative references in *html*
    using the protocol and domain extracted from *final_url*, so every
    link in the result is absolute.

    :param html: raw page source.
    :param final_url: the URL the page was ultimately served from.
    :return: the normalized HTML string.
    """
    protocol = UrlUtil.get_protocol(final_url)
    domain = UrlUtil.get_domain(final_url)
    normalized = HtmlUtil.parse_protocol(html, protocol)
    return HtmlUtil.parse_domain(normalized, protocol, domain)
def download(self, main_item, after_scroll_time=1):
    """Download one web page with the shared webdriver and fill *main_item*.

    Navigates to ``main_item.request_url``, scrolls to the bottom to let
    lazy content load, then records final URL, page source, load time,
    fetch time, sender IP (from /etc/internet-snapshot.conf) and the
    resolved server IP onto the item.

    :param main_item: a MainItem carrying the request URL; mutated in place.
    :param after_scroll_time: seconds to wait after the scroll script runs.
    :return: the populated MainItem, or None if *main_item* has the wrong type.
    """
    if not isinstance(main_item, MainItem):
        logging.error("Received param must items.MainItem, but get " +
                      str(type(main_item)))
        return None
    start_time = time.time()
    try:
        self.driver.get(main_item.request_url)  # request the page
        # todo persist the screenshot
        screenshot_base64 = self.driver.get_screenshot_as_base64()
    except TimeoutException as e:
        logging.info("Get url:" + main_item.request_url + ", msg: " + e.msg)
        # stop loading so we can still read whatever rendered before the timeout
        self.driver.execute_script("window.stop()")
    except WebDriverException as e:
        logging.error("When download page, error class: %s, message: %s." %
                      (e.__class__, e.msg))
        # the driver is likely dead; recreate a headless Firefox instance
        options = Options()
        options.add_argument('-headless')
        self.driver = Firefox(firefox_options=options)
        logging.info("Webdriver reinit")
    finally:
        load_time = time.time() - start_time
        logging.info("Get url:" + main_item.request_url + " spend " +
                     str(load_time) + "s.")
    # Fix: DNS resolution previously ran inside the finally block above, so a
    # socket.gaierror (unresolvable domain) aborted the whole download.
    try:
        server_ip = socket.gethostbyname(
            UrlUtil.get_domain(main_item.request_url))
    except socket.error as e:
        logging.error("When resolving server ip: %s" % e)
        server_ip = None
    # JS that jumps the viewport to the bottom of the document
    js_scroll = """
    function go_down() {
        var h = document.documentElement.scrollHeight || document.body.scrollHeight;
        window.scroll(h, h);
    }
    go_down()
    """
    try:
        self.driver.execute_script(js_scroll)  # perform the scroll
        time.sleep(after_scroll_time)  # give lazy content N seconds to load
    except WebDriverException as e:
        logging.error(
            "When scroll page, error class: %s, error message: %s" %
            (e.__class__, e.msg))
    current_url = None
    page_source = None
    try:
        current_url = self.driver.current_url
        page_source = self.driver.page_source
    except UnexpectedAlertPresentException as e:
        logging.info("点击弹出框")
        try:
            self.driver.switch_to.alert.accept()
        except NoAlertPresentException:
            # raised when the alert is already gone after accepting it
            pass
        try:
            current_url = self.driver.current_url
            page_source = self.driver.page_source
        except WebDriverException as e:
            logging.error(e.msg)
    except WebDriverException as e:
        logging.error(e.msg)
    finally:
        if not current_url:
            current_url = "Something error occurred, please check the error log."
        if not page_source:
            page_source = "Something error occurred, please check the error log."
    # fill in the collected information
    main_item.final_url = current_url
    # download_item.screen_shot = screenshot_base64
    main_item.load_time = load_time
    main_item.html = page_source
    time_array = time.localtime(int(time.time()))
    main_item.get_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
    with open("/etc/internet-snapshot.conf", "r") as f:
        ext_conf = json.load(f)
    main_item.send_ip = ext_conf["ip"]
    main_item.server_ip = server_ip
    return main_item
def parse(self):
    """Persist the downloaded page and fan out its links.

    Stores the snapshot record and raw HTML in MySQL, normalizes in-page
    links, pushes same-top-domain links back onto the engine's redis queue
    (deduplicated via a redis set), and records suspicious external
    domains for manual review.

    :return: None (also returned early when ``downloader_item`` has the
        wrong type).
    """
    if not isinstance(self.downloader_item, MainItem):
        logging.error("The param type is: " + str(type(self.downloader_item)) +
                      ", but it should be MainItem.")
        return None
    html = self.downloader_item.html
    # persist the downloader_item
    with self.connection.cursor() as cursor:
        sql = 'INSERT INTO snapshot (request_url, final_url, load_time, refer, get_time,' \
              ' task_id, send_ip, server_ip, deepth) VALUES (%s, %s, %s, %s, %s, %s, %s, ' \
              '%s, %s);'
        result = cursor.execute(sql, self.downloader_item.save_tuple())
        if result != 1:
            # Fix: save_tuple() is a tuple; concatenating it to a str raised
            # TypeError on this error path — wrap it in str().
            logging.error("snapshot插入记录" +
                          str(self.downloader_item.save_tuple()) + "失败!")
    # fetch the auto-increment id of the record just inserted
    with self.connection.cursor() as cursor:
        sql = 'SELECT last_insert_id() as ss_id;'
        cursor.execute(sql)
        result = cursor.fetchone()
        ss_id = result["ss_id"]
    # persist the page content
    ss_html = SsHtmlItem(ss_id=ss_id, html=html)
    with self.connection.cursor() as cursor:
        sql = 'INSERT INTO ss_html (ss_id, html) VALUES (%s, %s);'
        result = cursor.execute(sql, ss_html.save_tuple())
        if result != 1:
            # Fix: str + tuple TypeError — wrap the tuple in str().
            logging.error("ss_html插入记录" + str(ss_html.save_tuple()) + "失败!")
    # normalize in-page links against the final protocol/domain
    final_protocol = UrlUtil.get_protocol(self.downloader_item.final_url)
    final_domain = UrlUtil.get_domain(self.downloader_item.final_url)
    format_html = HtmlUtil.parse_protocol(html, final_protocol)
    format_html = HtmlUtil.parse_domain(format_html, final_protocol,
                                        final_domain)
    tree = etree.HTML(format_html)
    hrefs = tree.xpath("//@href")  # every href attribute in the page
    iframes = tree.xpath("//iframe/@src")  # iframe source links
    jss = tree.xpath("//script/@src")  # external script links
    hrefs.extend(iframes)
    hrefs.extend(jss)
    if hrefs:
        hrefs = href_clean(hrefs)
    else:
        hrefs = list()
    inner_chains = set()  # inner links, returned to the engine for iteration
    unknown_domains = set()  # suspicious external top domains, stored for manual review
    request_top_domain = UrlUtil.get_top_domain(
        self.downloader_item.request_url)
    for href in hrefs:
        this_top_domain = UrlUtil.get_top_domain(href)
        if request_top_domain == this_top_domain and \
                UrlUtil.get_url_suffix(href) != "js":
            inner_chains.add(href)
        elif this_top_domain not in self.safe_chains and \
                not UrlUtil.is_gov_or_edu(href):
            # top domain neither whitelisted nor a gov/edu site
            unknown_domains.add(this_top_domain)
    # wrap the inner links to iterate and push them into redis
    logging.info("Length of inner_chains is " + str(len(inner_chains)))
    dup_set_name = "engine:dup_set:" + str(self.downloader_item.task_id)
    queue_name = "engine:queue:" + str(self.downloader_item.task_id)
    for inner_chain in inner_chains:
        # an int ttl means the dedup set still exists (task still alive)
        if isinstance(self.redis_conn.ttl(dup_set_name), int):
            sadd_re = self.redis_conn.sadd(dup_set_name, inner_chain)
            if sadd_re == 1:  # 1 == newly added, so the dedup check is free
                new_main_item = MainItem(
                    inner_chain,
                    refer=self.downloader_item.final_url,
                    task_id=self.downloader_item.task_id,
                    deepth=self.downloader_item.deepth + 1)
                self.redis_conn.lpush(
                    queue_name,
                    json.dumps(new_main_item, default=main_item_to_json))
    # persist suspicious external domains
    if unknown_domains:
        # Fix: the malicious_domains table was re-queried once per unknown
        # domain inside the loop; load it a single time up front instead.
        with self.connection.cursor() as cursor:
            sql = "SELECT mydomain FROM malicious_domains;"
            cursor.execute(sql)
            malicious_records = cursor.fetchall()
        malicious_domains = set(
            malicious_record["mydomain"]
            for malicious_record in malicious_records)
        for unknown_domain in unknown_domains:
            if unknown_domain in malicious_domains:
                suspicious_item = SuspiciousItem(
                    ss_id, unknown_domain, 1, 1,
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time())))
            else:
                suspicious_item = SuspiciousItem(ss_id, unknown_domain, 0, -1,
                                                 None)
            with self.connection.cursor() as cursor:
                sql = 'INSERT INTO suspicious_records (ss_id, unknown_domain, checked, result, ' \
                      'check_time) VALUES (%s, %s, %s, %s, %s)'
                result = cursor.execute(sql, suspicious_item.save_tuple())
                if result != 1:
                    # Fix: str + tuple TypeError — wrap the tuple in str().
                    logging.error("suspicious_records插入记录" +
                                  str(suspicious_item.save_tuple()) + "失败!")
    self.connection.commit()
    self.connection.close()
    logging.info(self.downloader_item.request_url + " parse over.")