def check_js(self, doc: BeautifulSoup):
    """Check <script> tags: each must have a src, and local scripts must
    live in a 'js' folder.

    External scripts (absolute http/https URLs) are exempt from the folder
    rule.  Each violation deducts points and records an error (code 17).

    :param doc: parsed document to inspect
    """
    elements = doc.select('script')
    for tag in elements:
        path = tag.get('src')
        if path is None:
            self.result = False
            reduct_point = self.POINT * 1
            self.minus += reduct_point
            msg = 'script 標籤內沒有設定 src tag: {}'.format(tag)
            error_data = et.get_error_data(17, msg, 1, reduct_point)
            self.errors.append(error_data)
            continue
        regex_check = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            path)
        # BUG FIX: re.findall returns a list, so the old comparison
        # `regex_check != 0` was always true and every script was skipped,
        # making the folder check below unreachable.  Test list emptiness.
        if regex_check:
            # absolute URL -> external script, folder rule does not apply
            continue
        check_list = path.split('/')
        if 'js' not in check_list:
            self.result = False
            reduct_point = self.POINT * 1
            self.minus += reduct_point
            msg = 'javascript 未寫在 js 資料夾中。 path: {}'.format(path)
            error_data = et.get_error_data(17, msg, 1, reduct_point)
            self.errors.append(error_data)
def check_img_display(self, url: str, doc: bs4.BeautifulSoup):
    """Verify that every <img> on the page resolves to a fetchable resource.

    Each src is resolved against *url* and requested; an unreachable image
    or an unexpected status code deducts points and records an error
    (code 10).  403 does not count as an error — presumably to tolerate
    hosts that block hotlinking (confirm with original author).
    """
    for img in doc.find_all('img'):
        path = img.get('src')
        # path = path.replace('data:', '')  # $$ FIND-AT: 2020/06/19
        abs_url = urljoin(url, path)
        try:
            res = requests.get(abs_url)
        except Exception:
            self.result = False
            reduct_point = self.POINT * 1
            self.minus += reduct_point
            msg = '圖片完全無法讀取。 img_path: {}'.format(path)
            self.errors.append(et.get_error_data(10, msg, 1, reduct_point))
            continue
        status = res.status_code
        if status not in (200, 403):
            self.result = False
            reduct_point = self.POINT * 1
            self.minus += reduct_point
            msg = '圖片顯示狀態錯誤。 status: {} , img-path: {}'.format(
                status, path)
            self.errors.append(et.get_error_data(10, msg, 1, reduct_point))
def check_doc_type(self):
    """Penalize the page when the exact HTML5 doctype string is absent.

    Reads the raw markup from self.html; records error code 2 on failure.
    """
    if '<!DOCTYPE html>' in self.html:
        return
    self.result = False
    reduct_point = self.POINT * 1
    self.minus += reduct_point
    msg = '<!DOCTYPE> 標籤錯誤'
    self.errors.append(et.get_error_data(2, msg, 1, reduct_point))
def check_scroll_bar(self, browser: webdriver):
    """Penalize pages wider than 1280px (a horizontal scrollbar appears).

    Queries document.body.scrollWidth through the given webdriver; records
    error code 8 on failure.
    """
    page_width = browser.execute_script("return document.body.scrollWidth")
    if page_width <= 1280:
        return
    self.result = False
    reduct_point = self.POINT * 1
    self.minus += reduct_point
    msg = '網頁出現橫向卷軸。 網頁寬度: {}'.format(page_width)
    self.errors.append(et.get_error_data(8, msg, 1, reduct_point))
def check_title(self, doc: BeautifulSoup):
    """Check that <head><title> exists and has non-empty text.

    Records error code 5 when the tag is missing or its text is blank.

    :param doc: parsed document to inspect
    """
    tag = doc.select('head title')
    if len(tag) == 0:
        self.result = False
        reduct_point = self.POINT * 1
        self.minus += reduct_point
        msg = 'title 標籤未建立'
        error_data = et.get_error_data(5, msg, 1, reduct_point)
        self.errors.append(error_data)
        return
    title = tag[0].text
    # BUG FIX: the original tested `len(title) == 0 or title is None`, which
    # would raise TypeError on a None title before the None check ran.
    # `not title` covers both None and the empty string safely.
    if not title:
        self.result = False
        reduct_point = self.POINT * 1
        self.minus += reduct_point
        msg = 'title 標籤錯誤'
        error_data = et.get_error_data(5, msg, 1, reduct_point)
        self.errors.append(error_data)
def check_ul_tag(self, doc: BeautifulSoup):
    """Penalize each misused <ul> element.

    Delegates detection to self.get_wrong_ul, which fills the given list
    with offending elements; each one records error code 24.
    """
    bad_elements = []
    self.get_wrong_ul(doc, bad_elements)
    for element in bad_elements:
        self.result = False
        reduct_point = self.POINT * 1
        self.minus += reduct_point
        msg = 'ul tag 使用錯誤, 錯誤元素: {} , 當前標籤: {} , 父標籤: {}'.format(
            str(element).replace('\n', ''), element.name, element.parent.name)
        self.errors.append(et.get_error_data(24, msg, 1, reduct_point))
def check_access(self) -> bool:
    """Fetch self.url and store the response on self.res.

    :return: True on success; False after recording a fatal (100 point)
        error when the request raises.
    """
    try:
        self.res = requests.get(self.url)
    except Exception as e:
        self.result = False
        self.minus += 100
        msg = '無法讀取作業網頁。 url: {} -> error: {}'.format(self.url, e)
        self.errors.append(et.get_error_data(1, msg, 1, 100))
        return False
    return True
def check_response_status(self) -> bool:
    """Check the fetched page answered HTTP 200.

    :return: True for status 200; otherwise records a fatal (100 point)
        error and returns False.
    """
    status = self.res.status_code
    if status == 200:
        return True
    self.result = False
    self.minus += 100
    msg = '網站不存在,或路徑錯誤。 status: {} , url: {}'.format(status, self.url)
    self.errors.append(et.get_error_data(1, msg, 1, 100))
    return False
def check_img_setting(self, url: str, doc: BeautifulSoup):
    """Check width/height attributes on body images.

    Intended behavior (from the error messages): images whose declared
    height/width ratio differs from the real image's ratio get error 18;
    images with malformed size attributes get error 19.
    """
    elements = doc.select('body img')
    for tag in elements:
        path = tag.get('src')
        abs_url = urljoin(url, path)
        height = tag.get('height')
        width = tag.get('width')
        # NOTE(review): this guard requires `width` to match "NN%", but the
        # inner branch then requires BOTH attributes to match "NNpx" — the
        # two can never hold at once, so the ratio comparison below appears
        # unreachable and only error 19 can fire.  The guard was possibly
        # meant to be `height is not None and width is not None`; confirm
        # intent before changing.
        if height is not None and re.match(r'[0-9]+%', str(width)):
            if re.match(r'[0-9]+\s*px', str(height)) and re.match(
                    r'[0-9]+\s*px', str(width)):
                # declared ratio from the attributes, rounded to 2 decimals
                h = int(height.replace('px', ''))
                w = int(width.replace('px', ''))
                hwp = round(h / w, 2)
                # actual ratio of the downloaded image (helper defined elsewhere)
                raw_hwp = self.get_img_hwp(abs_url)
                if hwp != raw_hwp:
                    self.result = False
                    reduct_point = self.POINT * 1
                    self.minus += reduct_point
                    msg = '圖片設定後的寬高比例與原圖比例不同。 tag: {}'.format(tag)
                    error_data = et.get_error_data(18, msg, 1, reduct_point)
                    self.errors.append(error_data)
            else:
                self.result = False
                reduct_point = self.POINT * 1
                self.minus += reduct_point
                msg = '圖片設定錯誤。 tag: {}'.format(tag)
                error_data = et.get_error_data(19, msg, 1, reduct_point)
                self.errors.append(error_data)
def check_css(self, doc: BeautifulSoup):
    """Check that every linked stylesheet path goes through a 'css' folder.

    Records error code 16 for each stylesheet outside that folder.
    """
    for tag in doc.select('link[rel = "stylesheet"][type = "text/css"]'):
        path = tag.get('href')
        if 'css' in path.split('/'):
            continue
        self.result = False
        reduct_point = self.POINT * 1
        self.minus += reduct_point
        msg = 'css未放入css資料夾中。 path: {}'.format(path)
        self.errors.append(et.get_error_data(16, msg, 1, reduct_point))
def check_li_tag(self, doc: BeautifulSoup):
    """Penalize <li> elements whose direct parent is not <ul> or <ol>.

    Records error code 26 per offending element.
    """
    for li in doc.select('li'):
        parent_tag = li.parent.name
        if parent_tag in ('ul', 'ol'):
            continue
        self.result = False
        reduct_point = self.POINT * 1
        self.minus += reduct_point
        msg = 'li標籤單獨使用, 錯誤元素: {} , 父標籤: {}'.format(
            str(li).replace('\n', ''), parent_tag)
        self.errors.append(et.get_error_data(26, msg, 1, reduct_point))
def check_lang(self, doc: BeautifulSoup):
    """Check that <html> declares lang as Taiwanese Chinese.

    Missing attribute or a value outside the accepted spellings records
    error code 4.
    """
    tags = doc.select('html[lang]')
    if not tags:
        self.result = False
        reduct_point = self.POINT * 1
        self.minus += reduct_point
        msg = 'lang 屬性未建立'
        self.errors.append(et.get_error_data(4, msg, 1, reduct_point))
        return
    lang = tags[0].get('lang')
    if lang in ('zh-TW', 'zh-tw', 'ZH-TW'):
        return
    self.result = False
    reduct_point = self.POINT * 1
    self.minus += reduct_point
    msg = 'lang 屬性錯誤。 tag: {}'.format(lang)
    self.errors.append(et.get_error_data(4, msg, 1, reduct_point))
def check_charset(self, doc: BeautifulSoup):
    """Check that a <meta charset> tag exists and declares UTF-8.

    Missing tag or a value outside the accepted spellings records error
    code 3.
    """
    tags = doc.select('meta[charset]')
    if not tags:
        self.result = False
        reduct_point = self.POINT * 1
        self.minus += reduct_point
        msg = 'charset 標籤未建立'
        self.errors.append(et.get_error_data(3, msg, 1, reduct_point))
        return
    charset = tags[0].get('charset')
    if charset in ('utf-8', 'utf8', 'UTF8', 'UTF-8'):
        return
    self.result = False
    reduct_point = self.POINT * 1
    self.minus += reduct_point
    msg = 'charset 標籤錯誤。 tag: {}'.format(charset)
    self.errors.append(et.get_error_data(3, msg, 1, reduct_point))
def content_is_empty(self) -> bool:
    """Check the page has enough visible text.

    Counts non-space characters in self.doc.text against
    self.text_low_limit.

    :return: True when the page has enough text; otherwise records a
        fatal (100 point) error and returns False.
    """
    stripped = self.doc.text.strip().replace(' ', '')
    text_count = len(stripped)
    if text_count >= self.text_low_limit:
        return True
    self.result = False
    self.minus += 100
    msg = '網頁內容趨近空白 url: {} , 字數: {}'.format(self.url, text_count)
    self.errors.append(et.get_error_data(1, msg, 1, 100))
    return False
def check_html_parse(self) -> bool:
    """Decode the fetched response as UTF-8 and parse it into self.doc.

    Stores the raw markup on self.html.

    :return: True on success; False after recording a fatal (100 point)
        error when parsing raises.
    """
    self.res.encoding = 'utf-8'
    self.html = self.res.text
    try:
        self.doc = BeautifulSoup(self.html, 'html.parser')
    except Exception as e:
        self.result = False
        self.minus += 100
        msg = '作業網頁無法解析。 url: {} -> error: {}'.format(self.url, e)
        self.errors.append(et.get_error_data(1, msg, 1, 100))
        return False
    return True
def check_gl_symbols(self, html: str, doc: BeautifulSoup):
    """Compare the counts of '<' and '>' in the markup.

    Text content is escaped first via PublicTool.escape_content_gls so
    only structural angle brackets are counted; a mismatch records error
    code 21 weighted by the difference.
    """
    temp_html = PublicTool.escape_content_gls(html, doc)
    lt_total = temp_html.count('<')
    gt_total = temp_html.count('>')
    if lt_total == gt_total:
        return
    dev = abs(lt_total - gt_total)
    self.result = False
    reduct_point = self.POINT * dev
    self.minus += reduct_point
    msg = '<、> 符號數量錯誤。 相差: {}'.format(dev)
    self.errors.append(et.get_error_data(21, msg, 1, reduct_point))
def check_file_path(self, url):
    """Penalize page file names containing Chinese characters, uppercase
    letters, or whitespace.

    Only the last path segment of *url* is inspected; records error
    code 11 on a match.
    """
    segments = url.split('/')
    # str.split always yields at least one element, so this guard is
    # effectively dead code — kept to preserve original behavior.
    if len(segments) == 0:
        return
    file_name = segments[-1]
    if not re.findall(r'([\u4E00-\u9FFF]+|[A-Z]+|\s+)', file_name):
        return
    self.result = False
    reduct_point = self.POINT * 1
    self.minus += reduct_point
    msg = '網頁檔名或是含有 "中文" 或 "大寫字母" 或 "空白字元" 。 path_string: {}'.format(
        file_name)
    self.errors.append(et.get_error_data(11, msg, 1, reduct_point))
def check_head_body(self, doc: BeautifulSoup):
    """Flag <head> children that should have been placed inside <body>.

    Any element child of <head> whose tag name is not a legitimate
    head-only tag records error code 7.
    """
    head_only_tags = ('base', 'link', 'meta', 'script', 'style', 'title')
    for child in doc.select('head')[0].children:
        # skip NavigableString and other non-element nodes
        if type(child) is not bs4.element.Tag:
            continue
        if child.name in head_only_tags:
            continue
        self.result = False
        reduct_point = self.POINT * 1
        self.minus += reduct_point
        msg = '網頁內容不在 <body></body> 區間內'
        self.errors.append(et.get_error_data(7, msg, 1, reduct_point))
def check_tags(self, html: str, doc: BeautifulSoup):
    """Heuristically compare open/close counts for every tag used on the page.

    Counts literal '<tag>' / '<tag ...' openings against '</tag>' closings
    in the escaped markup; a mismatch records error code 20 weighted by
    the difference.  This is a string-counting heuristic, not a parse.
    """
    # Tags excluded from the open/close comparison.
    # NOTE(review): 'address' and 'aside' are NOT void elements in HTML —
    # their presence here looks deliberate (perhaps to silence false
    # positives) but should be confirmed.
    self_closing = [
        'area', 'base', 'br', 'embed', 'hr', 'iframe', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'address', 'aside'
    ]
    # every distinct tag name present in the document
    tag_list = [tag.name for tag in doc.find_all()]
    tag_list = list(set(tag_list))
    # escape angle brackets inside text content so only markup is counted
    temp_html = PublicTool.escape_content_gls(html, doc)
    for tag in tag_list:
        if tag in self_closing:
            continue
        # pattern = r'<{1}' + tag + ' ' + '[^<,>]*>{1}'
        # pattern = r'^(<{1}' + tag + ' ' + ')(.*)(>{1})$'
        # '<tag ' matches openings that carry attributes
        pattern = r'<' + tag + ' '
        normal_start = '<' + tag + '>'  # opening without attributes
        normal_end = '</' + tag + '>'
        sp_case_list = re.findall(pattern, temp_html)
        start_count = len(sp_case_list) + temp_html.count(normal_start)
        end_count = temp_html.count(normal_end)
        if start_count != end_count:
            # print(tag, start_count, end_count)
            dev = abs(start_count - end_count)
            if start_count > end_count:
                info = normal_start + ' 多於 ' + normal_end
            else:
                info = normal_end + ' 多於 ' + normal_start
            self.result = False
            # deduction scales with how many tags are unbalanced
            reduct_point = self.POINT * dev
            self.minus += reduct_point
            msg = 'tag 的開始與結束有錯誤。 {} 相差數: {}'.format(info, dev)
            error_data = et.get_error_data(20, msg, dev, reduct_point)
            self.errors.append(error_data)
def check_attr_space(self, html: str, doc: BeautifulSoup):
    """Heuristically detect attributes not separated by whitespace.

    For each tag-with-attributes found in the raw markup, strips href/src
    values and all quoted values, then compares the number of '=' signs
    (one per attribute) with the number of spaces left in the tag text.
    Fewer spaces than '=' signs means two attributes were run together;
    records error code 22.
    """
    tag_list = [tag.name for tag in doc.find_all()]
    tag_list = list(set(tag_list))
    # patterns for quoted href/src values (may contain spaces, so they are
    # removed before the space count)
    hp = r'href=\"([^"]*)\"'
    srcp = r'src=\"([^"]*)\"'
    for tag in tag_list:
        # matches '<tag ...attributes...>' in the raw markup
        pattern = r'<{1}' + tag + ' [^<,>]*>{1}'
        find_list = re.findall(pattern, html)
        for check_tag in find_list:
            temp_tag = check_tag
            if bool(re.findall(hp, temp_tag)):
                replace_p = r' href=\"([^"]*)\"'
                temp_tag = re.sub(replace_p, '', temp_tag)
            if bool(re.findall(srcp, temp_tag)):
                replace_p = r' src=\"([^"]*)\"'
                temp_tag = re.sub(replace_p, '', temp_tag)
            # remove the contents of any remaining quoted attribute values
            # so spaces inside values don't mask a missing separator
            req_p = r'\"([^"]*)\"'
            re_list = re.findall(req_p, temp_tag)
            for re_str in re_list:
                re_str = re_str.replace('"', '')
                temp_tag = temp_tag.replace(re_str, '')
            # one '=' per attribute vs. spaces left between attributes
            class_num = temp_tag.count('=')
            space_num = temp_tag.count(' ')
            if space_num < class_num:
                self.result = False
                reduct_point = self.POINT * 1
                self.minus += reduct_point
                msg = '元素屬性之間沒有用空白隔開 tag: {}'.format(check_tag)
                error_data = et.get_error_data(22, msg, 1, reduct_point)
                self.errors.append(error_data)
def check_window_open(self, url: str, doc: BeautifulSoup):
    """External links must open in a new window (target="_blank").

    tel: and mailto: links are exempt; each offending external link
    records error code 14.
    """
    for tag in doc.select('body [href]'):
        link = tag.get('href')
        if 'tel:' in link and '+' in link:
            continue
        if 'mailto:' in link and '@' in link:
            continue
        abs_url = urljoin(url, link)
        is_external = PublicTool.is_external_domain(url, abs_url)
        if not is_external or tag.get('target') == '_blank':
            continue
        self.result = False
        reduct_point = self.POINT * 1
        self.minus += reduct_point
        msg = '連結到外部連結沒有新開視窗。 tag: {}'.format(tag)
        self.errors.append(et.get_error_data(14, msg, 1, reduct_point))
def check_attr_quote(self, html: str, doc: BeautifulSoup):
    """Heuristically verify every attribute value is wrapped in double quotes.

    For each raw tag string (extracted elsewhere by AttrQuoteCheck), counts
    occurrences of each attribute name in the raw text and expects exactly
    two '"' characters per attribute; a mismatch records error code 23.
    """
    # escape angle brackets inside text content first
    temp_html = PublicTool.escape_content_gls(html, doc)
    tags_with_attr = AttrQuoteCheck.get_tags_with_attr(temp_html, doc)
    for raw_str in tags_with_attr:
        # re-parse the single raw tag to enumerate its attributes
        soup = BeautifulSoup(raw_str, 'html.parser')
        for el in soup():
            all_attrs = list(el.attrs.keys())
            equal_symbol_count = 0
            # NOTE(review): counting substring occurrences of each attribute
            # name can over-count when the name also appears inside another
            # attribute's value or name (e.g. 'id' within 'width') — confirm
            # this heuristic against known inputs.
            for attr_name in all_attrs:
                equal_symbol_count += raw_str.count('{}'.format(attr_name))
            # edit at 2020/06/23
            quote_count = raw_str.count('"')
            # expect an opening and a closing quote per attribute
            if quote_count != equal_symbol_count * 2:
                self.result = False
                reduct_point = self.POINT * 1
                self.minus += reduct_point
                msg = '屬性的 " " 符號沒有正確使用。 錯誤元素: {}'.format(raw_str)
                error_data = et.get_error_data(23, msg, 1, reduct_point)
                self.errors.append(error_data)
def check_all_src_name(self, url: str, doc: BeautifulSoup):
    """Validate naming of every locally hosted file, folder and image path.

    Local hrefs: neither the file name nor any folder name may contain
    Chinese characters, uppercase letters, or whitespace (error 13).
    Local images must additionally live in an images/img/image folder
    (error 15) and every path segment obeys the same naming rule
    (error 12).  External URLs and tel:/mailto: links are exempt.

    :param url: absolute URL of the page, used to resolve relative paths
    :param doc: parsed document to inspect
    """
    # compile once; same rule used for files, folders and image paths
    bad_name = re.compile(r'([\u4E00-\u9FFF]+|[A-Z]+|\s+)')
    # file part
    elements = doc.select('[href]')
    for tag in elements:
        link = tag.get('href')
        if 'tel:' in link and '+' in link:
            continue
        if 'mailto:' in link and '@' in link:
            continue
        abs_url = urljoin(url, link)
        if PublicTool.is_external_domain(url, abs_url):
            continue
        # file
        path_list = link.split('/')
        file = path_list.pop()
        if bad_name.findall(file):
            self.result = False
            reduct_point = self.POINT * 1
            self.minus += reduct_point
            msg = '上傳的檔案中,有的名稱含有 中文 或 大寫 或 空白字元。 file name: {}'.format(file)
            error_data = et.get_error_data(13, msg, 1, reduct_point)
            self.errors.append(error_data)
        # folder
        for folder in path_list:
            if bad_name.findall(folder):
                self.result = False
                reduct_point = self.POINT * 1
                self.minus += reduct_point
                msg = '上傳的資料夾中,有的名稱含有 中文 or 大寫 or 空白字元。 folder name: {}'.format(folder)
                error_data = et.get_error_data(13, msg, 1, reduct_point)
                self.errors.append(error_data)
    # img part
    img_list = doc.find_all('img')
    for img in img_list:
        path = img.get('src')
        # BUG FIX: was urljoin(self.url, path) while the rest of this method
        # resolves against the `url` parameter — use the parameter consistently.
        abs_url = urljoin(url, path)
        if PublicTool.is_external_domain(url, abs_url):
            continue
        check_list = path.split('/')
        # 是否在圖片資料夾
        if ('images' not in check_list and 'img' not in check_list
                and 'image' not in check_list):
            self.result = False
            reduct_point = self.POINT * 1
            self.minus += reduct_point
            msg = '圖片未放在圖片資料夾(images or img or image) 或是路徑設定錯誤。 tag: {}'.format(path)
            error_data = et.get_error_data(15, msg, 1, reduct_point)
            self.errors.append(error_data)
        # 查看檔名
        for sub_path in check_list:
            if bad_name.findall(sub_path):
                self.result = False
                reduct_point = self.POINT * 1
                self.minus += reduct_point
                msg = '圖片檔名或是路徑含有中文/大寫/空白字元。 path_string: {}'.format(sub_path)
                error_data = et.get_error_data(12, msg, 1, reduct_point)
                self.errors.append(error_data)