def get_content_soup_new(soup, url):
    """Extract the visible text of the main content area of a new-site page.

    Tries a series of container divs (class "right", then id "content",
    then class "left") and returns their whitespace-normalized text run
    through replace_special().  Returns None (after recording an error)
    when no soup or no content container is found.
    """
    if not soup:
        record_error(url, "content container")
        # was `return False`; the other failure path returns None and all
        # callers only test falsiness, so keep the falsy contract uniform
        return None
    # remove comment and map elements from soup (the original re-checked
    # `if soup:` here, which is always true after the guard above)
    for (name, kwargs) in settings["GET_NEW_CONTENT_IGNORE"]:
        for s in soup.find_all(name, **kwargs):
            s.extract()
    content = soup.find('div', class_="right")
    # non-form pages with an empty "right" container keep their real
    # content in div#content instead
    if content is not None and url.find("form") == -1:
        if content.get_text() == "" or content.get_text().isspace():
            content = soup.find('div', id="content")
    # vertical template nests the real content inside div#content
    if content is None or content.find('div', id="content"):
        content = soup.find('div', id="content")
    if content is None:
        content = soup.find('div', class_="left")
    if content is None:
        record_error(url, "content container")
        return None
    # collapse the text nodes into one space-separated string
    text_list = [child.strip() for child in content.stripped_strings
                 if not child.isspace()]
    # remove special characters and additional spaces
    return replace_special(" ".join(text_list))
def get_meta_soup(soup, url):
    """Return {'title', 'description', 'keywords'} scraped from a page.

    Each value is a whitespace-collapsed, replace_special()-cleaned UTF-8
    string, or the literal "none" when the element is missing or empty.
    AttributeError/TypeError (missing soup, missing tag, missing attr) are
    treated as "metadata absent" rather than raised.
    """
    # find the title from respond
    try:
        # NOTE(review): the original chained .replace(" ", " ") — a no-op as
        # written; presumably it normalized non-breaking spaces, so that is
        # done explicitly here.  TODO confirm against live pages.
        title = soup.title.text.strip().replace(u"\xa0", u" ")
        title = title.encode('utf-8')
        title = replace_special(title)
    except AttributeError:
        # soup is None or has no <title>
        record_error(url, "title")
        title = "none"
    except TypeError:
        record_error(url, "title")
        title = "none"
    # get description from respond
    try:
        description = soup.find("meta", {'name': "description"})
        description = description.get("content").strip().encode('utf-8')
        description = replace_special(description)
        # collapse runs of spaces; the original loop replaced " " with " "
        # and therefore never terminated once any space was present
        while description.find("  ") != -1:
            description = description.replace("  ", " ")
        if description == "":
            description = "none"
    except TypeError:
        description = "none"
    except AttributeError:
        description = "none"
    # get the keywords from respond
    try:
        keywords = soup.find("meta", {'name': "keywords"})
        keywords = (" ".join(
            keywords.get('content').strip().splitlines())).encode('utf-8')
        keywords = replace_special(keywords)
        # same double-space collapse fix as for the description
        while keywords.find("  ") != -1:
            keywords = keywords.replace("  ", " ")
        # original tested `keywords is None`, which can never be true after
        # the assignments above; mirror the description's empty-check instead
        if keywords == "":
            keywords = "none"
    except TypeError:
        keywords = "none"
    except AttributeError:
        keywords = "none"
    dic = {'title': title, 'description': description, 'keywords': keywords}
    return dic
def compare_content_soup(old_soup, new_soup, old_url, new_url):
    """Compare the main-content text of an old page and its new counterpart.

    Returns True when the space-stripped texts match (or the new page is a
    known placeholder), False otherwise.  Differences are echoed via
    entry_print() and appended to result\\content_detail.txt.
    """
    # normalize the extractors' falsy failure values (None/False) to ""
    # (the original performed this normalization twice)
    old_content = get_content_soup_old(old_soup, old_url) or ""
    new_content = get_content_soup_new(new_soup, new_url) or ""
    # a "We are currently..." placeholder on the new site is not a difference
    if not old_content and new_content.startswith("We are currently"):
        return True
    old_content = replace_special(old_content)
    new_content = replace_special(new_content)
    # compare ignoring all spacing
    if old_content.replace(" ", "") == new_content.replace(" ", ""):
        return True
    entry_print("***********************************************")
    entry_print("CONTENT DIFFERENCE FOUND!")
    entry_print("Old URL: " + old_url)
    entry_print("New URL: " + new_url)
    entry_print("Old content: " + old_content)
    entry_print("New content: " + new_content)
    entry_print("***********************************************")
    # `with` guarantees the handle is closed even if a write fails
    with open("result\\content_detail.txt", 'a') as detail:
        detail.write(
            "----------------------------------------------------------------------------------------------\n"
        )
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Old content: " + old_content + "\n")
        detail.write("New content: " + new_content + "\n")
    return False
def get_blog_content(soup):
    """Return the cleaned, space-joined visible text of a blog page.

    Elements listed under settings["GET_BLOG_CONTENT_IGNORE"] (e.g. style
    and script tags) are stripped first.  Returns None when soup is None.
    """
    if soup is None:
        return None
    # drop all ignored elements before collecting text
    for name, kwargs in settings["GET_BLOG_CONTENT_IGNORE"]:
        for element in soup.find_all(name, **kwargs):
            element.extract()
    # remove special characters and additional spaces
    fragments = [piece.strip() for piece in soup.stripped_strings
                 if not piece.isspace()]
    return replace_special(" ".join(fragments))
def check_homepage_link(old_soup, new_soup, old_url, new_url, browser=None):
    """Audit the links on the migrated homepage against the old homepage.

    Checks, in order: new-page links that still point back at the old site;
    non-friendly (televox.west.com) URLs; links whose targets are 404/missing;
    the utility-navigation ("published") links matching by href and by name;
    and the social-navigation links matching by href/target title.
    Findings are echoed via entry_print() and appended to
    result\\homepage_detail.txt.  Returns True only when every check passes.
    `browser` is forwarded to get_soup() for URLs that require a session.
    """
    detail = open("result\\homepage_detail.txt", 'a')
    old_hostname = urlparse(old_url).hostname
    new_hostname = urlparse(new_url).hostname
    page_pass = True
    printable = set(string.printable)
    new_content = new_soup.find('div', class_="ptl_page")
    # normalize trailing slashes on both URLs
    if old_url.endswith("/"):
        old_url = old_url[:-1]
    if new_url.endswith("/"):
        new_url = new_url[:-1]
    # fall back to the raw URL when it has no parseable hostname
    if not old_hostname:
        old_hostname = old_url
    if not new_hostname:
        new_hostname = new_url
    if not new_content:
        record_error(new_url, "new homepage container")
        # NOTE(review): this early return leaves `detail` unclosed
        return False
    # remove banner and navigation menu from soup
    if new_content:
        for (name, kwargs) in settings["HOMEPAGE_LINK_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()
    # anchors whose href is not a bare fragment / #aftermap jump
    new_tags = new_content.find_all('a',
                                    href=re.compile("^(?!.*(#aftermap|#)).*$"))
    # check for new links that direct to old site
    host_link = old_url.replace(urlparse(old_url).path, "")
    domain = get_domain(old_url)
    for tag in new_tags:
        href = tag['href']
        href_hostname = urlparse(href).hostname
        if href_hostname is None:
            href_hostname = ""
        # relative links cannot point at the old site
        if href.startswith("/"):
            continue
        # old-site link: same host prefix, or old domain in hostname
        # (excluding mailto and the televox CMS host), or an iapps URL
        if (href.startswith(host_link) and host_link != "") \
                or (href_hostname.find(domain + '.') != -1
                    and not href.startswith("mailto")
                    and href.find("televox.west.com") == -1) \
                or href.find("iapps") != -1:
            page_pass = False
            entry_print("***********************************************")
            entry_print("HOMEPAGE LINKS THAT GO BACK TO OLD SITE!")
            entry_print("New URL: " + new_url)
            detail.write("HOMEPAGE LINKS THAT GO BACK TO OLD SITE!\n")
            detail.write("New URL: " + new_url + "\n")
            entry_print("Bad tag: " + str(tag))
            entry_print("***********************************************")
            detail.write("Bad tag: " + str(tag) + "\n")
            detail.write("-----------------------------------------------\n")
        # a televox.west.com link is a non-friendly (unrewritten) URL
        if href.find("televox.west.com") != -1:
            page_pass = False
            entry_print("***********************************************")
            entry_print("NON-FRIENDLY URL FOUND!")
            entry_print("New URL: " + new_url)
            detail.write("NON-FRIENDLY URL FOUND!\n")
            detail.write("New URL: " + new_url + "\n")
            entry_print("Bad tag: " + str(tag))
            entry_print("***********************************************")
            detail.write("Bad tag: " + str(tag) + "\n")
            detail.write("-----------------------------------------------\n")
    # check invalid links in new site
    new_invalid_links = []
    for tag in new_tags:
        url = tag.get('href')
        if url is None:
            continue
        # https, tel:, mailto:, fragment and /common links are not fetched
        if url.startswith("https://"):
            continue
        if url.startswith("tel:") or url.startswith(
                "mailto:") or url.find("#") != -1 or url.startswith("/common"):
            continue
        if url.startswith("/"):
            url = "http://" + new_hostname + url
        # CMS-hosted URLs need the authenticated browser session
        if url.find("televox.west.com") != -1:
            new_target = get_soup(url, browser)
        else:
            new_target = get_soup(url)
        new_target_title = get_meta_soup(new_target, url)['title']
        # treat 404-ish target titles as broken links
        if new_target_title.find("404") != -1 or new_target_title == "Page Not Found" or new_target_title == "none" \
                or new_target_title == "The resource cannot be found.":
            new_invalid_links.append((str(tag), new_target_title))
    if new_invalid_links:
        entry_print("***********************************************")
        entry_print("INVALID LINK FOUND IN HOMEPAGE!")
        entry_print("New URL: " + new_url)
        detail.write("-----------------------------------------------\n")
        detail.write("INVALID LINK FOUND IN HOMEPAGE!\n")
        detail.write("New URL: " + new_url + "\n")
        ind = 0
        for tag, target in new_invalid_links:
            ind += 1
            entry_print("Bad tag" + str(ind) + ": " + tag)
            entry_print("Target title: " + target)
            detail.write("Bad tag" + str(ind) + ": " + tag + "\n")
            detail.write("Target title: " + target + "\n")
        entry_print("***********************************************")
        # NOTE(review): invalid links are reported but do not fail page_pass
    # check published links for homepage
    old_publish = old_soup.find('nav', id="utility-navigation")
    new_publish = new_soup.find('nav', id="utility-navigation")
    if old_publish:
        # anchors whose href contains no fragment
        old_published_links = old_publish.find_all(
            'a', href=re.compile("^((?!#).)*$"))
    else:
        old_published_links = []
    if new_publish:
        new_published_links = new_publish.find_all(
            'a', href=re.compile("^((?!#).)*$"))
    else:
        new_published_links = []
    if len(old_published_links) != len(new_published_links):
        entry_print("***********************************************")
        entry_print("NUMBER OF PUBLISHED LINKS DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_published_links)))
        entry_print("Number of new links: " + str(len(new_published_links)))
        entry_print("***********************************************")
        detail.write("NUMBER OF PUBLISHED LINKS DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_published_links)) +
                     "\n")
        detail.write("Number of new links: " + str(len(new_published_links)) +
                     "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    else:
        publish_pass = True
        # check the href and name for each published link
        for ind in range(len(new_published_links)):
            old_link = old_published_links[ind]['href']
            new_link = new_published_links[ind]['href']
            # compare hrefs ignoring spaces, dashes, and parentheses
            old_link_dup = old_link.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            new_link_dup = new_link.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            # NOTE(review): .replace(" ", " ") is a no-op as written;
            # presumably a garbled non-breaking-space normalization
            old_name = old_published_links[ind].get_text().replace(" ", " ")
            new_name = new_published_links[ind].get_text().replace(" ", " ")
            # keep only printable characters, compare case-insensitively
            old_name = "".join([i for i in old_name
                                if i in printable]).strip().upper()
            new_name = "".join([i for i in new_name
                                if i in printable]).strip().upper()
            old_name_dup = old_name.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            new_name_dup = new_name.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            if old_link_dup != new_link_dup:
                # tel:/mailto:/numeric links are not resolvable pages
                if old_link.startswith("tel:") or old_link.startswith(
                        "mailto:") or unicode(old_link[0]).isnumeric():
                    continue
                if old_link.startswith("/"):
                    old_link = old_hostname + old_link
                if new_link.startswith("/"):
                    new_link = new_hostname + new_link
                # hrefs differ: fetch both targets and compare their titles
                old_target = get_soup(old_link)
                new_target = get_soup(new_link, browser=browser)
                old_target_title = get_meta_soup(old_target, old_link)['title']
                new_target_title = get_meta_soup(new_target, new_link)['title']
                if new_target_title.endswith("..."):
                    # new title is truncated; compare only the common prefix
                    new_target_title = new_target_title[:-3]
                    old_target_title = old_target_title[:len(new_target_title)]
                if old_target_title != new_target_title:
                    if publish_pass:
                        # print the header only once per page
                        entry_print(
                            "***********************************************")
                        entry_print("PUBLISHED LINKS DO NOT MATCH!")
                        entry_print("Old URL: " + old_url)
                        entry_print("New URL: " + new_url)
                        detail.write("PUBLISHED LINKS DO NOT MATCH!\n")
                        detail.write("Old URL: " + old_url + "\n")
                        detail.write("New URL: " + new_url + "\n")
                        publish_pass = False
                        page_pass = False
                    entry_print("Old target: " + old_target_title)
                    entry_print("New target: " + new_target_title)
                    entry_print("Old link: " + old_link)
                    entry_print("New link: " + new_link)
                    detail.write("Old target: " + old_target_title + "\n")
                    detail.write("New target: " + new_target_title + "\n")
                    detail.write("Old link: " + old_link + "\n")
                    detail.write("New link: " + new_link + "\n")
            if old_name_dup != new_name_dup:
                if publish_pass:
                    entry_print(
                        "***********************************************")
                    entry_print("PUBLISHED LINK NAMES DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("PUBLISHED LINK NAMES DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    publish_pass = False
                    page_pass = False
                entry_print("Old name: " + old_name)
                entry_print("New name: " + new_name)
                detail.write("Old name: " + old_name + "\n")
                detail.write("New name: " + new_name + "\n")
        if not publish_pass:
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
    # check social media links for homepage
    old_social = old_soup.find('nav', class_="social-navigation")
    new_social = new_soup.find('nav', class_="social-navigation")
    if old_social:
        old_social_links = old_social.find_all('a')
    else:
        old_social_links = []
    if new_social:
        new_social_links = new_social.find_all('a')
    else:
        new_social_links = []
    if len(old_social_links) != len(new_social_links):
        entry_print("***********************************************")
        entry_print("NUMBER OF SOCIAL LINKS DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_social_links)))
        entry_print("Number of new links: " + str(len(new_social_links)))
        entry_print("***********************************************")
        detail.write("NUMBER OF SOCIAL LINKS DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_social_links)) +
                     "\n")
        detail.write("Number of new links: " + str(len(new_social_links)) +
                     "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    else:
        social_pass = True
        # check the href and name for each social link
        for ind in range(len(new_social_links)):
            old_link = old_social_links[ind]['href']
            new_link = new_social_links[ind]['href']
            # social links may appear in reverse order on the new site
            old_link_reversed = old_social_links[len(old_social_links) - ind -
                                                 1]['href']
            if old_link != new_link and old_link_reversed != new_link:
                if new_link.startswith("/"):
                    new_link = new_hostname + new_link
                if old_link.startswith("/"):
                    old_link = old_hostname + old_link
                old_target = get_soup(old_link)
                new_target = get_soup(new_link)
                old_target_title = replace_special(
                    get_meta_soup(old_target, old_link)['title'])
                new_target_title = replace_special(
                    get_meta_soup(new_target, new_link)['title'])
                if new_target_title.endswith("..."):
                    # truncated new title: compare only the common prefix
                    new_target_title = new_target_title[:-3]
                    old_target_title = old_target_title[:len(new_target_title)]
                if old_target_title != new_target_title:
                    if social_pass:
                        entry_print(
                            "***********************************************")
                        entry_print("SOCIAL LINKS DO NOT MATCH!")
                        entry_print("Old URL: " + old_url)
                        entry_print("New URL: " + new_url)
                        detail.write("SOCIAL LINKS DO NOT MATCH!\n")
                        detail.write("Old URL: " + old_url + "\n")
                        detail.write("New URL: " + new_url + "\n")
                        # NOTE(review): social mismatches never set
                        # page_pass = False — confirm this is intentional
                        social_pass = False
                    entry_print("Old target: " + old_target_title)
                    entry_print("New target: " + new_target_title)
                    entry_print("Old link: " + old_link)
                    entry_print("New link: " + new_link)
                    detail.write("Old target: " + old_target_title + "\n")
                    detail.write("New target: " + new_target_title + "\n")
                    detail.write("Old link: " + old_link + "\n")
                    detail.write("New link: " + new_link + "\n")
        if not social_pass:
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
    detail.close()
    return page_pass
def compare_homepage_content(old_soup, new_soup, old_url, new_url):
    """Compare homepage body text (div#content + div#features) of both sites.

    Returns True when the normalized texts match, False when they differ or
    when a container exists on only one side.  Also fails the page when the
    new homepage still contains a literal "Read More".  Findings go to
    result\\homepage_detail.txt.
    """
    def _strip_ignored(container, setting_key):
        # drop elements the comparison should not see (per settings)
        if container:
            for (name, kwargs) in settings[setting_key]:
                for s in container.find_all(name, **kwargs):
                    s.extract()

    page_pass = True
    old_container1 = old_soup.find('div', id="content")
    old_container2 = old_soup.find('div', id="features")
    new_container1 = new_soup.find('div', id="content")
    new_container2 = new_soup.find('div', id="features")
    _strip_ignored(old_container1, "OLD_HOMEPAGE_CONTENT_IGNORE")
    _strip_ignored(old_container2, "OLD_HOMEPAGE_CONTENT_IGNORE")
    _strip_ignored(new_container1, "NEW_HOMEPAGE_CONTENT_IGNORE")
    _strip_ignored(new_container2, "NEW_HOMEPAGE_CONTENT_IGNORE")
    old_content = get_homepage_content(old_container1) + get_homepage_content(
        old_container2)
    new_content = get_homepage_content(new_container1) + get_homepage_content(
        new_container2)
    # `with` closes the report on every return path (the original had to
    # call detail.close() by hand before each early return)
    with open("result\\homepage_detail.txt", 'a') as detail:
        if new_content.find("Read More") != -1:
            entry_print("***********************************************")
            entry_print("HOMEPAGE CONTAINS 'READ MORE'!")
            entry_print("New URL: " + new_url)
            entry_print("***********************************************")
            detail.write("HOMEPAGE CONTAINS READ MORE!\n")
            detail.write("New URL: " + new_url + "\n")
            detail.write("-----------------------------------------------\n")
            page_pass = False
        # normalize wording/punctuation that legitimately differs between
        # the two sites before comparing
        old_content = old_content.replace("Read More", "Learn More").replace(
            "Read more", "Learn More")
        old_content = old_content.replace("...", "").replace(">", "")
        new_content = new_content.replace("...", "").replace(">", "")
        old_content = old_content.replace("Learn More", "").replace(
            "Learn more", "")
        new_content = new_content.replace("Learn More", "")
        old_content = replace_special(old_content)
        new_content = replace_special(new_content)
        # content present on only one side means a container is missing
        if not old_content and new_content:
            record_error(old_url, "homepage container")
            return False
        elif old_content and not new_content:
            record_error(new_url, "homepage container")
            return False
        # compare ignoring all spacing
        if old_content.replace(" ", "") != new_content.replace(" ", ""):
            entry_print("***********************************************")
            entry_print("HOMEPAGE CONTENT DIFFERENCE FOUND!")
            entry_print("Old URL: " + old_url)
            entry_print("New URL: " + new_url)
            entry_print("Old content: " + old_content)
            entry_print("New content: " + new_content)
            entry_print("***********************************************")
            detail.write("HOMEPAGE CONTENT DIFFERENCE FOUND!\n")
            detail.write("Old URL: " + old_url + "\n")
            detail.write("New URL: " + new_url + "\n")
            detail.write("Old content: " + old_content + "\n")
            detail.write("New content: " + new_content + "\n")
            detail.write("-----------------------------------------------\n")
            page_pass = False
    return page_pass
def compare_meta_soup(old_soup, new_soup, old_url, new_url):
    """Compare title/description/keywords metadata of an old and a new page.

    Returns True on match, False on differences, and -1 when the new page is
    a "Page Not Found" stub while the old page is not (missing page).  When
    the OLD page itself is a 404/missing page the comparison is skipped and
    counts as a pass.  Differences are appended to result\\meta_detail.txt.
    """
    page_pass = True
    old_meta = get_meta_soup(old_soup, old_url)
    new_meta = get_meta_soup(new_soup, new_url)
    old_title = replace_special(old_meta['title'])
    new_title = replace_special(new_meta['title'])
    old_desc = replace_special(old_meta['description'])
    new_desc = replace_special(new_meta['description'])
    old_key = replace_special(old_meta['keywords'])
    new_key = replace_special(new_meta['keywords'])
    # ignore the omitted content ("..." marks a truncated new title);
    # compare only the common prefix in that case
    if new_title.endswith("..."):
        new_title = new_title[:len(new_title) - 3]
        old_title = old_title[:len(new_title)]
    # auto-generated boilerplate descriptions are not compared
    if old_desc.startswith("Learn more about"):
        old_desc = "none"
    # collapse runs of spaces; the original loop replaced " " with " " and
    # therefore never terminated once any space was present
    while old_title.find("  ") != -1:
        old_title = old_title.replace("  ", " ")
    while new_title.find("  ") != -1:
        new_title = new_title.replace("  ", " ")
    title_same = old_title == new_title
    desc_same = old_desc == new_desc
    key_same = old_key == new_key
    # if the old page does not exist, then skip the site
    # (a new space page will be created in the new site)
    if old_title == "The resource cannot be found." or old_title.startswith(
            "404"):
        title_same = True
        desc_same = True
        key_same = True
    # `with` closes the report on every return path; also renamed the
    # original handle `file`, which shadowed the builtin
    with open("result\\meta_detail.txt", "a") as report:
        if old_title.lower() != "page not found" and new_title == "Page Not Found":
            entry_print("***********************************************")
            entry_print("MISSING PAGE FOUND!")
            entry_print("Old URL: " + old_url)
            entry_print("New URL: " + new_url)
            entry_print("***********************************************")
            report.write("-----------------------------------------------\n")
            report.write("MISSING PAGE FOUND!\n")
            report.write("Old URL: " + old_url + "\n")
            report.write("New URL: " + new_url + "\n")
            return -1
        if not (title_same and desc_same and key_same):
            # print and record the issue in meta.txt
            report.write("-----------------------------------------------\n")
            report.write("Old URL: " + old_url + "\n")
            report.write("New URL: " + new_url + "\n")
            entry_print("***********************************************")
            entry_print("METADATA DIFFERENCE FOUND!")
            entry_print("Old URL: " + old_url)
            entry_print("New URL: " + new_url)
            if not title_same:
                entry_print("Old title: " + old_title)
                entry_print("New title: " + new_title)
                report.write("Old title: " + old_title + "\n")
                report.write("New title: " + new_title + "\n")
            if not desc_same:
                entry_print("Old description: " + old_desc)
                entry_print("New description: " + new_desc)
                report.write("Old description: " + old_desc + "\n")
                report.write("New description: " + new_desc + "\n")
            if not key_same:
                entry_print("Old keywords: " + old_key)
                entry_print("New keywords: " + new_key)
                report.write("Old keywords: " + old_key + "\n")
                report.write("New keywords: " + new_key + "\n")
            entry_print("***********************************************")
            page_pass = False
    return page_pass
def compare_link_soup(old_soup, new_soup, old_url, new_url, browser=None):
    """Compare the in-content links of an old page against the new page.

    Flags: new-site links that still point back at the old site,
    non-friendly (televox.west.com) URLs, invalid (404) link targets, a
    mismatched link count, and pairwise link/target-title mismatches.
    Details are appended to result\\site_detail.txt.  Returns True only
    when every check passes.  `browser` is forwarded to get_soup() for
    URLs that require an authenticated session.
    """
    detail = open("result\\site_detail.txt", 'a')
    old_hostname = urlparse(old_url).hostname
    new_hostname = urlparse(new_url).hostname
    if not old_hostname:
        old_hostname = ""
    if not new_hostname:
        new_hostname = ""
    # grab container
    old_content = old_soup.find('div', class_="right")
    new_content = new_soup.find('div', class_="right")
    if not old_content and new_content:
        if old_soup.find('div', id="content"):
            old_content = old_soup.find('div', id="content")
    if not old_content and new_content:
        record_error(old_url, "link container")
        detail.close()
        return False
    elif old_content and not new_content:
        record_error(new_url, "link container")
        detail.close()
        return False
    elif not old_content and not new_content:
        # nothing to compare on either side
        detail.close()  # fix: this path used to leak the file handle
        return True
    # vertical template uses different container
    if old_content.find('div', id="content"):
        old_content = old_soup.find('div', id="content")
    if new_content.find('div', id="content"):
        new_content = new_soup.find('div', id="content")
    # remove extra links from container
    if old_content:
        for (name, kwargs) in settings["COMPARE_OLD_LINK_IGNORE"]:
            for s in old_content.find_all(name, **kwargs):
                s.extract()
    if new_content:
        for (name, kwargs) in settings["COMPARE_NEW_LINK_IGNORE"]:
            # fix: kwargs was passed positionally (as the attrs dict)
            # instead of expanded, unlike every other ignore-list loop
            for s in new_content.find_all(name, **kwargs):
                s.extract()
    old_tags = old_content.find_all('a', href=True) if old_content is not None else []
    new_tags = new_content.find_all('a', href=True) if new_content is not None else []
    # remove links that does not have any content inside
    old_tags = [
        tag for tag in old_tags
        if tag.text and not tag.text.isspace() or tag.find('img')
    ]
    new_tags = [
        tag for tag in new_tags
        if tag.text and not tag.text.isspace() or tag.find('img')
    ]
    # check for new links that direct to old site
    host_link = old_url.replace(urlparse(old_url).path, "")
    domain = get_domain(old_url)
    new_pass1 = True
    for tag in new_tags:
        href = tag['href']
        href_hostname = urlparse(href).hostname
        if not href_hostname:
            href_hostname = ""
        if href.find(host_link) != -1 \
                or (href_hostname.find(domain + '.') != -1
                    and href.find("televox.west.com") == -1) \
                or href.find("iapps") != -1:
            if new_pass1:
                # print the header only once per page
                entry_print("***********************************************")
                entry_print("LINKS THAT GO BACK TO OLD SITE!")
                entry_print("Old URL: " + old_url)
                entry_print("New URL: " + new_url)
                detail.write("LINKS THAT GO BACK TO OLD SITE!\n")
                detail.write("Old URL: " + old_url + "\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass1 = False
            entry_print("Bad tag: " + str(tag))
            detail.write("Bad tag: " + str(tag) + "\n")
    if not new_pass1:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
    # check for non-friendly urls
    new_pass2 = True
    for tag in new_tags:
        href = tag['href']
        if not href:
            continue
        if href.find("televox.west.com") != -1:
            if new_pass2:
                entry_print("***********************************************")
                entry_print("NON-FRIENDLY URL FOUND!")
                entry_print("New URL: " + new_url)
                detail.write("NON-FRIENDLY URL FOUND!\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass2 = False
            entry_print("Bad tag: " + str(tag))
            detail.write("Bad tag: " + str(tag) + "\n")
    if not new_pass2:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
    # remove file links (fix: the original removed items from the lists it
    # was iterating, which silently skips the element after each removal)
    old_tags = [
        tag for tag in old_tags
        if not re.search("jpg|png|pdf|mp4", tag.get('href'))
    ]
    new_tags = [
        tag for tag in new_tags
        if not re.search("jpg|png|pdf|mp4|UserFile", tag.get('href'))
    ]
    bad_tags = []
    if len(old_tags) != len(new_tags):
        # remove 404 pages from the old tags; iterate over a copy because
        # old_tags is mutated inside the loop (fix for skip-on-remove)
        for tag in list(old_tags):
            url = tag.get('href')
            if url is None:
                continue
            if url.startswith("https://"):
                continue
            if url.startswith("tel:") or url.startswith(
                    "mailto:") or url.find("#") != -1:
                continue
            if url.startswith("/"):
                url = "http://" + old_hostname + url
            old_target = get_soup(url)
            old_target_title = get_meta_soup(old_target, url)['title']
            if old_target_title.find("404") != -1 \
                    or re.search("page not found|the resource cannot be found",
                                 old_target_title.lower()) \
                    or old_target_title == "none":
                bad_tags.append((str(tag), old_target_title))
                old_tags.remove(tag)
    # check invalid links in new site
    new_invalid_links = []
    for tag in new_tags:
        url = tag.get('href')
        if url is None:
            continue
        if url.startswith("https://"):
            continue
        if url.startswith("tel:") or url.startswith(
                "mailto:") or url.find("#") != -1 or url.startswith("/common"):
            continue
        if url.startswith("/"):
            url = "http://" + new_hostname + url
        # CMS-hosted URLs need the authenticated browser session
        if url.find("televox.west.com") != -1:
            new_target = get_soup(url, browser)
        else:
            new_target = get_soup(url)
        new_target_title = get_meta_soup(new_target, url)['title']
        if new_target_title.find("404") != -1 or new_target_title == "Page Not Found" or new_target_title == "none" \
                or new_target_title == "The resource cannot be found.":
            new_invalid_links.append((str(tag), new_target_title))
    if new_invalid_links:
        entry_print("***********************************************")
        entry_print("INVALID LINK FOUND IN NEW SITE!")
        entry_print("New URL: " + new_url)
        detail.write("INVALID LINK FOUND IN NEW SITE!\n")
        detail.write("New URL: " + new_url + "\n")
        ind = 0
        for tag, target in new_invalid_links:
            ind += 1
            entry_print("Bad tag" + str(ind) + ": " + tag)
            entry_print("Target title: " + target)
            detail.write("Bad tag" + str(ind) + ": " + tag + "\n")
            detail.write("Target title: " + target + "\n")
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
    # check that number of links match if not, return
    if len(new_tags) != len(old_tags):
        entry_print("***********************************************")
        entry_print(
            "NUMBER OF LINKS DIFFERENT OR 404 LINK EXISTS IN NEW PAGE!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_tags)))
        entry_print("Number of new links: " + str(len(new_tags)))
        entry_print("Old tags: " + str(old_tags))
        entry_print("New tags: " + str(new_tags))
        if bad_tags:
            entry_print("404 tags in old site (removed):")
            for ind in range(len(bad_tags)):
                entry_print("Tag" + str(ind + 1) + ": " + bad_tags[ind][0])
                entry_print("Target title: " + bad_tags[ind][1])
        entry_print("***********************************************")
        detail.write(
            "NUMBER OF LINKS DIFFERENT OR 404 LINK EXISTS IN NEW PAGE!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_tags)) + "\n")
        detail.write("Number of new links: " + str(len(new_tags)) + "\n")
        if bad_tags:
            detail.write("404 tag(s) in old site (removed):\n")
            for ind in range(len(bad_tags)):
                detail.write("Tag" + str(ind + 1) + ": " + bad_tags[ind][0] +
                             "\n")
                detail.write("Target title: " + bad_tags[ind][1] + "\n")
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
        detail.close()
        return False
    # check that new and old links match
    new_pass3 = True
    count = 0
    for ind in range(len(new_tags)):
        old_link = old_tags[ind]['href'].replace("\\", "/").strip()
        new_link = new_tags[ind]['href'].replace("\\", "/").strip()
        if old_link == new_link:
            continue
        # when a fragment is present, compare from the fragment onward
        if old_link.find("#") != -1:
            old_ind = old_link.find("#")
            old_link = old_link[old_ind:]
        if new_link.find("#") != -1:
            new_ind = new_link.find("#")
            new_link = new_link[new_ind:]
        # take out the duplication part for old_link (e.g. /a/b/b -> /a/b)
        temp = old_link.split("/")
        if len(temp) > 2:
            if temp[-1] == temp[-2]:
                old_link = "/".join(temp[:-1])
        if urlparse(old_link).path == urlparse(new_link).path:
            continue
        if old_link.startswith("/"):
            old_link = "http://" + old_hostname + old_link
        # if the old link points to the homepage, then set it as "/"
        if old_link.endswith("/home") or old_link.endswith("/main"):
            old_link = "/"
        if new_link == "/home" or new_link == "/main":
            new_link = "/"
        if new_link != "/" and new_link.endswith("/"):
            new_link = new_link[:-1]
        if old_link != "/" and old_link.endswith("/"):
            old_link = old_link[:-1]
        if old_link != new_link and not new_link.startswith("/common"):
            # fragment links cannot be fetched; report the raw mismatch
            if old_link.find("#") != -1 or new_link.find("#") != -1:
                count += 1
                if new_pass3:
                    entry_print(
                        "***********************************************")
                    entry_print("LINKS THAT DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("LINKS THAT DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    new_pass3 = False
                entry_print("Old link" + str(count) + ": " + old_link)
                entry_print("New link" + str(count) + ": " + new_link)
                entry_print("Old tag" + str(count) + ": " + str(old_tags[ind]))
                entry_print("New tag" + str(count) + ": " + str(new_tags[ind]))
                detail.write("Old tag" + str(count) + ": " +
                             str(old_tags[ind]) + "\n")
                detail.write("New tag" + str(count) + ": " +
                             str(new_tags[ind]) + "\n")
                continue
            if old_link.startswith("/"):
                old_link = "http://" + old_hostname + old_link.strip()
            if new_link.startswith("/"):
                new_link = "http://" + new_hostname + new_link.strip()
            # hrefs differ: fetch both targets and compare their titles
            old_target = get_soup(old_link)
            new_target = get_soup(new_link, browser=browser)
            old_target_title = replace_special(
                get_meta_soup(old_target, old_link)['title'])
            new_target_title = replace_special(
                get_meta_soup(new_target, new_link)['title'])
            if new_target_title.endswith("..."):
                # truncated new title: compare only the common prefix
                new_target_title = new_target_title[:-3]
                old_target_title = old_target_title[:len(new_target_title)]
            if old_target_title != new_target_title:
                count += 1
                if new_pass3:
                    entry_print(
                        "***********************************************")
                    entry_print("LINKS THAT DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("LINKS THAT DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    new_pass3 = False
                entry_print("Old link" + str(count) + ": " + old_link)
                entry_print("New link" + str(count) + ": " + new_link)
                entry_print("Old target" + str(count) + ": " +
                            old_target_title)
                entry_print("New target" + str(count) + ": " +
                            new_target_title)
                entry_print("Old tag" + str(count) + ": " + str(old_tags[ind]))
                entry_print("New tag" + str(count) + ": " + str(new_tags[ind]))
                detail.write("Old tag" + str(count) + ": " +
                             str(old_tags[ind]) + "\n")
                detail.write("New tag" + str(count) + ": " +
                             str(new_tags[ind]) + "\n")
    if not new_pass3:
        detail.write("-----------------------------------------------\n")
        entry_print("***********************************************")
    detail.close()
    return new_pass1 and new_pass2 and new_pass3
def compare_form_soup(old_soup, new_soup, old_url, new_url):
    """Verify that the form on the new page mirrors the form on the old page.

    Compares the presence of both form containers, the "* required field"
    hint on the new form, and the sequence of entry names/choices.  Entries
    that merely changed order (case-insensitively) are tolerated.  Findings
    go to result\\form_detail.txt.  Returns True only when all checks pass.
    """
    detail = open("result\\form_detail.txt", 'a')
    require_pass = True
    title_pass = True
    entry_pass = True
    auth_pass = True
    old_container = old_soup.find('div', class_="form-container")
    new_container = new_soup.find('div', class_="secureform")
    # neither page carries a form: nothing to compare
    if not old_container and not new_container:
        detail.close()
        return True
    # a form present on exactly one side always fails
    if not old_container and new_container:
        record_error(old_url, "form container")
        detail.close()
        return False
    if old_container and not new_container:
        record_error(new_url, "form container")
        detail.close()
        return False
    # check the "required field" text in new form
    if not new_container.find(text=re.compile("required field")):
        entry_print("***********************************************")
        entry_print("FORM MISSING '* REQUIRED FIELD' TITLE!")
        entry_print("New URL: " + new_url)
        entry_print("***********************************************")
        detail.write("FORM MISSING '* REQUIRED FIELD' TITLE!\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("-----------------------------------------------\n")
        require_pass = False
    # strip the ignored elements, then collect entry names and choices
    for (tag_name, tag_kwargs) in settings["OLD_FORM_ENTRY_IGNORE"]:
        for ignored in old_container.find_all(tag_name, **tag_kwargs):
            ignored.extract()
    old_entries = [replace_special(text)
                   for text in old_container.stripped_strings]
    for (tag_name, tag_kwargs) in settings["NEW_FORM_ENTRY_IGNORE"]:
        for ignored in new_container.find_all(tag_name, **tag_kwargs):
            ignored.extract()
    new_entries = [replace_special(text)
                   for text in new_container.stripped_strings]
    if len(old_entries) != len(new_entries):
        # entry counts differ: report totals and the first mismatch
        entry_print("***********************************************")
        entry_print("NUMBER OF FORM ENTRIES DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old entries: " + str(len(old_entries)))
        entry_print("Number of new entries: " + str(len(new_entries)))
        entry_print("Old entries: " + str(old_entries))
        entry_print("New entries: " + str(new_entries))
        detail.write("NUMBER OF FORM ENTRIES DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old entries: " + str(len(old_entries)) + "\n")
        detail.write("Number of new entries: " + str(len(new_entries)) + "\n")
        detail.write("Old entries: " + str(old_entries) + "\n")
        detail.write("New entries: " + str(new_entries) + "\n")
        entry_pass = False
        # try to track down the issue
        for old_entry, new_entry in zip(old_entries, new_entries):
            if old_entry != new_entry and old_entry.upper() != new_entry.upper():
                entry_print("FIRST DIFFERENCE:")
                entry_print("Old entry name: " + old_entry)
                entry_print("New entry name: " + new_entry)
                detail.write("FIRST DIFFERENCE:\n")
                detail.write("Old entry name: " + old_entry + "\n")
                detail.write("New entry name: " + new_entry + "\n")
                break
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
    else:
        # pair up the positions whose entries differ (case-insensitively)
        mismatches = [(o, n) for o, n in zip(old_entries, new_entries)
                      if o != n and o.upper() != n.upper()]
        # the differing entries may simply have been reordered; compare
        # them as case-folded sorted sequences before reporting
        old_folded = sorted(pair[0].upper() for pair in mismatches)
        new_folded = sorted(pair[1].upper() for pair in mismatches)
        if old_folded != new_folded:
            counter = 0
            for old_entry, new_entry in mismatches:
                counter += 1
                if entry_pass:
                    # print the header only once
                    entry_print(
                        "***********************************************")
                    entry_print("FORM ENTRIES DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("FORM ENTRIES DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    entry_pass = False
                entry_print("Old entry name" + str(counter) + ": " + old_entry)
                entry_print("New entry name" + str(counter) + ": " + new_entry)
                detail.write("Old entry name" + str(counter) + ": " +
                             old_entry + "\n")
                detail.write("New entry name" + str(counter) + ": " +
                             new_entry + "\n")
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
    detail.close()
    return require_pass and title_pass and entry_pass and auth_pass