Code Example #1
def get_content_soup_new(soup, url):
    if not soup:
        record_error(url, "content container")
        return None

    # remove comment and map elements from the soup
    for (name, kwargs) in settings["GET_NEW_CONTENT_IGNORE"]:
        for s in soup.find_all(name, **kwargs):
            s.extract()

    content = soup.find('div', class_="right")
    text_list = []

    if content is not None and url.find("form") == -1:
        if content.get_text() == "" or content.get_text().isspace():
            content = soup.find('div', id="content")

    if content is None or content.find('div', id="content"):
        content = soup.find('div', id="content")
    if content is None:
        content = soup.find('div', class_="left")
    if content is None:
        record_error(url, "content container")
        return None

    for child in content.stripped_strings:
        if not child.isspace():
            text_list.append(child.strip())
    # remove special characters and additional spaces
    text = " ".join(text_list)
    text = replace_special(text)
    return text
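
These snippets lean on several module-level names that are not shown in this section: a settings dictionary of (tag name, find_all keyword arguments) pairs, plus record_error and replace_special helpers, and they appear to target Python 2 (note the unicode() call in code example #5). Below is a minimal sketch of those assumed pieces so the examples can be read in context; the real implementations may differ.

# Minimal sketch of the assumed shared helpers (not part of the original
# source); names and behavior are guesses based on how they are used.
import re

settings = {
    # each entry is (tag_name, keyword arguments forwarded to find_all)
    "GET_NEW_CONTENT_IGNORE": [
        ("div", {"class_": "map"}),
        ("div", {"class_": "comments"}),
    ],
    "GET_BLOG_CONTENT_IGNORE": [
        ("script", {}),
        ("style", {}),
    ],
}


def record_error(url, container_name):
    # placeholder: the real helper presumably logs the missing piece
    print("ERROR: could not find " + container_name + " for " + url)


def replace_special(text):
    # placeholder: strip non-breaking spaces and collapse runs of whitespace
    text = text.replace("\xc2\xa0", " ")
    return re.sub(r"\s+", " ", text).strip()
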
Code Example #2
def get_meta_soup(soup, url):
    # find the title from the response
    try:
        title = soup.title.text.strip().replace("  ", " ")
        title = title.encode('utf-8')
        title = replace_special(title)
    except (AttributeError, TypeError):
        record_error(url, "title")
        title = "none"

    # get the description from the response
    try:
        description = soup.find("meta", {'name': "description"})
        description = description.get("content").strip().encode('utf-8')
        description = replace_special(description)

        while description.find("  ") != -1:
            description = description.replace("  ", " ")
        if description == "":
            description = "none"
    except (TypeError, AttributeError):
        description = "none"

    # get the keywords from the response
    try:
        keywords = soup.find("meta", {'name': "keywords"})
        keywords = (" ".join(
            keywords.get('content').strip().splitlines())).encode('utf-8')
        keywords = replace_special(keywords)

        while keywords.find("  ") != -1:
            keywords = keywords.replace("  ", " ")

        if keywords == "":
            keywords = "none"
    except (TypeError, AttributeError):
        keywords = "none"

    dic = {'title': title, 'description': description, 'keywords': keywords}
    return dic
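
An illustrative call, assuming Python 2, BeautifulSoup 4, and the helper stubs sketched after code example #1; the HTML and URL are made up, and the exact cleaned values depend on what replace_special really does.

from bs4 import BeautifulSoup

html = """<html><head>
<title>  Example   Clinic </title>
<meta name="description" content="  Family   dentistry ">
<meta name="keywords" content="dentist,
cleaning">
</head><body></body></html>"""

soup = BeautifulSoup(html, "html.parser")
meta = get_meta_soup(soup, "http://example.com/")
# meta['title'], meta['description'] and meta['keywords'] now hold the
# cleaned strings, or "none" where a tag is missing or empty
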
Code Example #3
def compare_content_soup(old_soup, new_soup, old_url, new_url):
    page_pass = True

    old_content = get_content_soup_old(old_soup, old_url)
    new_content = get_content_soup_new(new_soup, new_url)

    # normalize missing content to empty strings
    if not old_content:
        old_content = ""
    if not new_content:
        new_content = ""

    if not old_content and new_content.startswith("We are currently"):
        return True

    old_content = replace_special(old_content)
    new_content = replace_special(new_content)

    if old_content.replace(" ", "") != new_content.replace(" ", ""):
        detail = open("result\\content_detail.txt", 'a')
        entry_print("***********************************************")
        entry_print("CONTENT DIFFERENCE FOUND!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Old content: " + old_content)
        entry_print("New content: " + new_content)
        entry_print("***********************************************")
        detail.write(
            "----------------------------------------------------------------------------------------------\n"
        )
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Old content: " + old_content + "\n")
        detail.write("New content: " + new_content + "\n")
        detail.close()
        page_pass = False
    return page_pass
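
All of the reporting in these comparison routines goes through entry_print, which is not shown in this section. A rough sketch of what it presumably does, echoing each message to the console and to a running summary file; the file name here is only a guess.

def entry_print(message):
    # placeholder: mirror every reported line to stdout and a summary log
    print(message)
    with open("result\\summary.txt", 'a') as summary:
        summary.write(message + "\n")
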
Code Example #4
def get_blog_content(soup):
    if soup is None:
        return None

    # remove all the <style> and <script> tags
    for (name, kwargs) in settings["GET_BLOG_CONTENT_IGNORE"]:
        for s in soup.find_all(name, **kwargs):
            s.extract()

    text_list = []
    for child in soup.stripped_strings:
        if not child.isspace():
            text_list.append(child.strip())

    # remove special characters and additional spaces
    text = " ".join(text_list)
    text = replace_special(text)
    return text
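
An illustrative call, assuming Python 2, BeautifulSoup 4, the helper stubs sketched earlier, and a made-up GET_BLOG_CONTENT_IGNORE entry that drops script and style tags.

from bs4 import BeautifulSoup

blog_html = """<div class="post">
<style>.post { color: red; }</style>
<h2>Spring cleaning tips</h2>
<script>trackView();</script>
<p>Brush twice a day and floss daily.</p>
</div>"""

settings["GET_BLOG_CONTENT_IGNORE"] = [("script", {}), ("style", {})]
post = BeautifulSoup(blog_html, "html.parser")
print(get_blog_content(post))
# roughly: Spring cleaning tips Brush twice a day and floss daily.
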
Code Example #5
def check_homepage_link(old_soup, new_soup, old_url, new_url, browser=None):
    detail = open("result\\homepage_detail.txt", 'a')
    old_hostname = urlparse(old_url).hostname
    new_hostname = urlparse(new_url).hostname
    page_pass = True
    printable = set(string.printable)
    new_content = new_soup.find('div', class_="ptl_page")

    if old_url.endswith("/"):
        old_url = old_url[:-1]
    if new_url.endswith("/"):
        new_url = new_url[:-1]

    if not old_hostname:
        old_hostname = old_url
    if not new_hostname:
        new_hostname = new_url

    if not new_content:
        record_error(new_url, "new homepage container")
        detail.close()
        return False

    # remove banner and navigation menu from soup
    for (name, kwargs) in settings["HOMEPAGE_LINK_IGNORE"]:
        for s in new_content.find_all(name, **kwargs):
            s.extract()

    new_tags = new_content.find_all('a',
                                    href=re.compile("^(?!.*(#aftermap|#)).*$"))

    # check for new links that direct to old site
    host_link = old_url.replace(urlparse(old_url).path, "")
    domain = get_domain(old_url)
    for tag in new_tags:
        href = tag['href']
        href_hostname = urlparse(href).hostname
        if href_hostname is None:
            href_hostname = ""
        if href.startswith("/"):
            continue
        if (href.startswith(host_link) and host_link != "") \
                or (href_hostname.find(domain + '.') != -1 and not href.startswith("mailto") and href.find("televox.west.com") == -1) \
                or href.find("iapps") != -1:
            page_pass = False
            entry_print("***********************************************")
            entry_print("HOMEPAGE LINKS THAT GO BACK TO OLD SITE!")
            entry_print("New URL: " + new_url)
            detail.write("HOMEPAGE LINKS THAT GO BACK TO OLD SITE!\n")
            detail.write("New URL: " + new_url + "\n")
            entry_print("Bad tag: " + str(tag))
            entry_print("***********************************************")
            detail.write("Bad tag: " + str(tag) + "\n")
            detail.write("-----------------------------------------------\n")
        if href.find("televox.west.com") != -1:
            page_pass = False
            entry_print("***********************************************")
            entry_print("NON-FRIENDLY URL FOUND! ")
            entry_print("New URL: " + new_url)
            detail.write("NON-FRIENDLY URL FOUND!\n")
            detail.write("New URL: " + new_url + "\n")
            entry_print("Bad tag: " + str(tag))
            entry_print("***********************************************")
            detail.write("Bad tag: " + str(tag) + "\n")
            detail.write("-----------------------------------------------\n")

    # check invalid links in new site
    new_invalid_links = []
    for tag in new_tags:
        url = tag.get('href')
        if url is None:
            continue
        if url.startswith("https://"):
            continue
        if url.startswith("tel:") or url.startswith(
                "mailto:") or url.find("#") != -1 or url.startswith("/common"):
            continue
        if url.startswith("/"):
            url = "http://" + new_hostname + url
        if url.find("televox.west.com") != -1:
            new_target = get_soup(url, browser)
        else:
            new_target = get_soup(url)
        new_target_title = get_meta_soup(new_target, url)['title']
        if new_target_title.find("404") != -1 or new_target_title == "Page Not Found" or new_target_title == "none" \
                or new_target_title == "The resource cannot be found.":
            new_invalid_links.append((str(tag), new_target_title))

    if new_invalid_links:
        entry_print("***********************************************")
        entry_print("INVALID LINK FOUND IN HOMEPAGE!")
        entry_print("New URL: " + new_url)
        detail.write("-----------------------------------------------\n")
        detail.write("INVALID LINK FOUND IN HOMEPAGE!\n")
        detail.write("New URL: " + new_url + "\n")
        ind = 0
        for tag, target in new_invalid_links:
            ind += 1
            entry_print("Bad tag" + str(ind) + ": " + tag)
            entry_print("Target title: " + target)
            detail.write("Bad tag" + str(ind) + ": " + tag + "\n")
            detail.write("Target title: " + target + "\n")
        entry_print("***********************************************")

    # check published links for homepage
    old_publish = old_soup.find('nav', id="utility-navigation")
    new_publish = new_soup.find('nav', id="utility-navigation")

    if old_publish:
        old_published_links = old_publish.find_all(
            'a', href=re.compile("^((?!#).)*$"))
    else:
        old_published_links = []
    if new_publish:
        new_published_links = new_publish.find_all(
            'a', href=re.compile("^((?!#).)*$"))
    else:
        new_published_links = []

    if len(old_published_links) != len(new_published_links):
        entry_print("***********************************************")
        entry_print("NUMBER OF PUBLISHED LINKS DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_published_links)))
        entry_print("Number of new links: " + str(len(new_published_links)))
        entry_print("***********************************************")
        detail.write("NUMBER OF PUBLISHED LINKS DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_published_links)) +
                     "\n")
        detail.write("Number of new links: " + str(len(new_published_links)) +
                     "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    else:
        publish_pass = True
        # check the href and name for each published link
        for ind in range(len(new_published_links)):
            old_link = old_published_links[ind]['href']
            new_link = new_published_links[ind]['href']
            old_link_dup = old_link.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            new_link_dup = new_link.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            old_name = old_published_links[ind].get_text().replace("  ", " ")
            new_name = new_published_links[ind].get_text().replace("  ", " ")
            old_name = "".join([i for i in old_name
                                if i in printable]).strip().upper()
            new_name = "".join([i for i in new_name
                                if i in printable]).strip().upper()
            old_name_dup = old_name.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            new_name_dup = new_name.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            if old_link_dup != new_link_dup:
                if old_link.startswith("tel:") or old_link.startswith(
                        "mailto:") or unicode(old_link[0]).isnumeric():
                    continue

                if old_link.startswith("/"):
                    old_link = old_hostname + old_link
                if new_link.startswith("/"):
                    new_link = new_hostname + new_link

                old_target = get_soup(old_link)
                new_target = get_soup(new_link, browser=browser)
                old_target_title = get_meta_soup(old_target, old_link)['title']
                new_target_title = get_meta_soup(new_target, new_link)['title']

                if new_target_title.endswith("..."):
                    new_target_title = new_target_title[:-3]
                    old_target_title = old_target_title[:len(new_target_title)]

                if old_target_title != new_target_title:
                    if publish_pass:
                        entry_print(
                            "***********************************************")
                        entry_print("PUBLISHED LINKS DO NOT MATCH!")
                        entry_print("Old URL: " + old_url)
                        entry_print("New URL: " + new_url)
                        detail.write("PUBLISHED LINKS DO NOT MATCH!\n")
                        detail.write("Old URL: " + old_url + "\n")
                        detail.write("New URL: " + new_url + "\n")
                        publish_pass = False
                        page_pass = False
                    entry_print("Old target: " + old_target_title)
                    entry_print("New target: " + new_target_title)
                    entry_print("Old link: " + old_link)
                    entry_print("New link: " + new_link)
                    detail.write("Old target: " + old_target_title + "\n")
                    detail.write("New target: " + new_target_title + "\n")
                    detail.write("Old link: " + old_link + "\n")
                    detail.write("New link: " + new_link + "\n")
            if old_name_dup != new_name_dup:
                if publish_pass:
                    entry_print(
                        "***********************************************")
                    entry_print("PUBLISHED LINK NAMES DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("PUBLISHED LINK NAMES DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    publish_pass = False
                    page_pass = False
                entry_print("Old name: " + old_name)
                entry_print("New name: " + new_name)
                detail.write("Old name: " + old_name + "\n")
                detail.write("New name: " + new_name + "\n")
        if not publish_pass:
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")

    # check social media links for homepage
    old_social = old_soup.find('nav', class_="social-navigation")
    new_social = new_soup.find('nav', class_="social-navigation")

    if old_social:
        old_social_links = old_social.find_all('a')
    else:
        old_social_links = []
    if new_social:
        new_social_links = new_social.find_all('a')
    else:
        new_social_links = []

    if len(old_social_links) != len(new_social_links):
        entry_print("***********************************************")
        entry_print("NUMBER OF SOCIAL LINKS DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_social_links)))
        entry_print("Number of new links: " + str(len(new_social_links)))
        entry_print("***********************************************")
        detail.write("NUMBER OF SOCIAL LINKS DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_social_links)) +
                     "\n")
        detail.write("Number of new links: " + str(len(new_social_links)) +
                     "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    else:
        social_pass = True
        # check the href and name for each social link
        for ind in range(len(new_social_links)):
            old_link = old_social_links[ind]['href']
            new_link = new_social_links[ind]['href']
            old_link_reversed = old_social_links[len(old_social_links) - ind -
                                                 1]['href']
            if old_link != new_link and old_link_reversed != new_link:
                if new_link.startswith("/"):
                    new_link = new_hostname + new_link
                if old_link.startswith("/"):
                    old_link = old_hostname + old_link

                old_target = get_soup(old_link)
                new_target = get_soup(new_link)
                old_target_title = replace_special(
                    get_meta_soup(old_target, old_link)['title'])
                new_target_title = replace_special(
                    get_meta_soup(new_target, new_link)['title'])

                if new_target_title.endswith("..."):
                    new_target_title = new_target_title[:-3]
                    old_target_title = old_target_title[:len(new_target_title)]

                if old_target_title != new_target_title:
                    if social_pass:
                        entry_print(
                            "***********************************************")
                        entry_print("SOCIAL LINKS DO NOT MATCH!")
                        entry_print("Old URL: " + old_url)
                        entry_print("New URL: " + new_url)
                        detail.write("SOCIAL LINKS DO NOT MATCH!\n")
                        detail.write("Old URL: " + old_url + "\n")
                        detail.write("New URL: " + new_url + "\n")
                        social_pass = False
                    entry_print("Old target: " + old_target_title)
                    entry_print("New target: " + new_target_title)
                    entry_print("Old link: " + old_link)
                    entry_print("New link: " + new_link)
                    detail.write("Old target: " + old_target_title + "\n")
                    detail.write("New target: " + new_target_title + "\n")
                    detail.write("Old link: " + old_link + "\n")
                    detail.write("New link: " + new_link + "\n")
        if not social_pass:
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
    detail.close()
    return page_pass
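
check_homepage_link and the other routines fetch linked pages through get_soup, which is not included in this section. A sketch of what it might look like, assuming the requests library for plain pages and an optional Selenium browser for pages that need rendering; the real helper probably adds retries and error logging.

import requests
from bs4 import BeautifulSoup


def get_soup(url, browser=None):
    # placeholder: fetch a page and return its parsed soup, or None on failure
    try:
        if browser is not None:
            browser.get(url)
            return BeautifulSoup(browser.page_source, "html.parser")
        response = requests.get(url, timeout=30)
        return BeautifulSoup(response.text, "html.parser")
    except Exception:
        return None
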
Code Example #6
def compare_homepage_content(old_soup, new_soup, old_url, new_url):
    detail = open("result\\homepage_detail.txt", 'a')
    page_pass = True
    old_container1 = old_soup.find('div', id="content")
    old_container2 = old_soup.find('div', id="features")
    new_container1 = new_soup.find('div', id="content")
    new_container2 = new_soup.find('div', id="features")

    if old_container1:
        for (name, kwargs) in settings["OLD_HOMEPAGE_CONTENT_IGNORE"]:
            for s in old_container1.find_all(name, **kwargs):
                s.extract()
    if old_container2:
        for (name, kwargs) in settings["OLD_HOMEPAGE_CONTENT_IGNORE"]:
            for s in old_container2.find_all(name, **kwargs):
                s.extract()
    if new_container1:
        for (name, kwargs) in settings["NEW_HOMEPAGE_CONTENT_IGNORE"]:
            for s in new_container1.find_all(name, **kwargs):
                s.extract()
    if new_container2:
        for (name, kwargs) in settings["NEW_HOMEPAGE_CONTENT_IGNORE"]:
            for s in new_container2.find_all(name, **kwargs):
                s.extract()

    old_content = get_homepage_content(old_container1) + get_homepage_content(
        old_container2)
    new_content = get_homepage_content(new_container1) + get_homepage_content(
        new_container2)

    if new_content.find("Read More") != -1:
        entry_print("***********************************************")
        entry_print("HOMEPAGE CONTAINS 'READ MORE'!")
        entry_print("New URL: " + new_url)
        entry_print("***********************************************")
        detail.write("HOMEPAGE CONTAINS READ MORE!\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False

    old_content = old_content.replace("Read More", "Learn More").replace(
        "Read more", "Learn More")
    old_content = old_content.replace("...", "").replace(">", "")
    new_content = new_content.replace("...", "").replace(">", "")
    old_content = old_content.replace("Learn More",
                                      "").replace("Learn more", "")
    new_content = new_content.replace("Learn More", "")
    old_content = replace_special(old_content)
    new_content = replace_special(new_content)

    if not old_content and new_content:
        record_error(old_url, "homepage container")
        detail.close()
        return False
    elif old_content and not new_content:
        record_error(new_url, "homepage container")
        detail.close()
        return False

    if old_content.replace(" ", "") != new_content.replace(" ", ""):
        entry_print("***********************************************")
        entry_print("HOMEPAGE CONTENT DIFFERENCE FOUND!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Old content: " + old_content)
        entry_print("New content: " + new_content)
        entry_print("***********************************************")
        detail.write("HOMEPAGE CONTENT DIFFERENCE FOUND!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Old content: " + old_content + "\n")
        detail.write("New content: " + new_content + "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    detail.close()
    return page_pass
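
get_homepage_content is another helper that is not shown here. Judging by how it is called above, it most likely flattens a container's visible text the same way get_content_soup_new and get_blog_content do, and returns an empty string when the container is missing; a sketch under that assumption:

def get_homepage_content(container):
    # placeholder: flatten the visible text of a homepage container
    if container is None:
        return ""
    text_list = [s.strip() for s in container.stripped_strings if not s.isspace()]
    return replace_special(" ".join(text_list))
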
Code Example #7
def compare_meta_soup(old_soup, new_soup, old_url, new_url):
    page_pass = True
    file = open("result\\meta_detail.txt", "a")

    old_meta = get_meta_soup(old_soup, old_url)
    new_meta = get_meta_soup(new_soup, new_url)

    old_title = replace_special(old_meta['title'])
    new_title = replace_special(new_meta['title'])
    old_desc = replace_special(old_meta['description'])
    new_desc = replace_special(new_meta['description'])
    old_key = replace_special(old_meta['keywords'])
    new_key = replace_special(new_meta['keywords'])

    # ignore the omitted content
    if new_title.endswith("..."):
        new_title = new_title[:len(new_title) - 3]
        old_title = old_title[:len(new_title)]

    if old_desc.startswith("Learn more about"):
        old_desc = "none"

    while old_title.find("  ") != -1:
        old_title = old_title.replace("  ", " ")
    while new_title.find("  ") != -1:
        new_title = new_title.replace("  ", " ")

    title_same = old_title == new_title
    desc_same = old_desc == new_desc
    key_same = old_key == new_key

    # if the old page does not exist, then skip the site
    # (a new space page will be created in the new site)
    if old_title == "The resource cannot be found." or old_title.startswith(
            "404"):
        title_same = True
        desc_same = True
        key_same = True

    if old_title.lower() != "page not found" and new_title == "Page Not Found":
        entry_print("***********************************************")
        entry_print("MISSING PAGE FOUND!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("***********************************************")
        file.write("-----------------------------------------------\n")
        file.write("MISSING PAGE FOUND!\n")
        file.write("Old URL: " + old_url + "\n")
        file.write("New URL: " + new_url + "\n")
        file.close()
        return -1

    if not (title_same and desc_same and key_same):
        # print and record the issue in meta.txt
        file.write("-----------------------------------------------\n")
        file.write("Old URL: " + old_url + "\n")
        file.write("New URL: " + new_url + "\n")
        entry_print("***********************************************")
        entry_print("METADATA DIFFERENCE FOUND!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        if not title_same:
            entry_print("Old title: " + old_title)
            entry_print("New title: " + new_title)
            file.write("Old title: " + old_title + "\n")
            file.write("New title: " + new_title + "\n")
        if not desc_same:
            entry_print("Old description: " + old_desc)
            entry_print("New description: " + new_desc)
            file.write("Old description: " + old_desc + "\n")
            file.write("New description: " + new_desc + "\n")
        if not key_same:
            entry_print("Old keywords: " + old_key)
            entry_print("New keywords: " + new_key)
            file.write("Old keywords: " + old_key + "\n")
            file.write("New keywords: " + new_key + "\n")
        entry_print("***********************************************")
        page_pass = False
    file.close()
    return page_pass
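
compare_meta_soup can return True, False, or the -1 sentinel for a page that is missing on the new site, so callers need to tell the sentinel apart from an ordinary failure. An illustrative caller with placeholder URLs, assuming old_soup and new_soup were parsed elsewhere:

result = compare_meta_soup(old_soup, new_soup,
                           "http://old.example.com/about",
                           "http://new.example.com/about")
if result == -1:
    print("skipped: page is missing on the new site")
elif result:
    print("metadata matches")
else:
    print("metadata differs; see result\\meta_detail.txt")
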
Code Example #8
def compare_link_soup(old_soup, new_soup, old_url, new_url, browser=None):
    detail = open("result\\site_detail.txt", 'a')
    old_hostname = urlparse(old_url).hostname
    new_hostname = urlparse(new_url).hostname

    if not old_hostname:
        old_hostname = ""
    if not new_hostname:
        new_hostname = ""

    # grab container
    old_content = old_soup.find('div', class_="right")
    new_content = new_soup.find('div', class_="right")

    if not old_content and new_content:
        if old_soup.find('div', id="content"):
            old_content = old_soup.find('div', id="content")

    if not old_content and new_content:
        record_error(old_url, "link container")
        detail.close()
        return False
    elif old_content and not new_content:
        record_error(new_url, "link container")
        detail.close()
        return False
    elif not old_content and not new_content:
        detail.close()
        return True

    # vertical template uses different container
    if old_content.find('div', id="content"):
        old_content = old_soup.find('div', id="content")

    if new_content.find('div', id="content"):
        new_content = new_soup.find('div', id="content")

    # remove extra links from container
    if old_content:
        for (name, kwargs) in settings["COMPARE_OLD_LINK_IGNORE"]:
            for s in old_content.find_all(name, **kwargs):
                s.extract()

    if new_content:
        for (name, kwargs) in settings["COMPARE_NEW_LINK_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()

    if old_content is None:
        old_tags = []
    else:
        old_tags = old_content.find_all('a', href=True)

    if new_content is None:
        new_tags = []
    else:
        new_tags = new_content.find_all('a', href=True)

    # remove links that do not have any content inside
    old_tags = [
        tag for tag in old_tags
        if tag.text and not tag.text.isspace() or tag.find('img')
    ]
    new_tags = [
        tag for tag in new_tags
        if tag.text and not tag.text.isspace() or tag.find('img')
    ]

    # check for new links that direct to old site
    host_link = old_url.replace(urlparse(old_url).path, "")
    domain = get_domain(old_url)
    new_pass1 = True
    for tag in new_tags:
        href = tag['href']
        href_hostname = urlparse(href).hostname
        if not href_hostname:
            href_hostname = ""
        if href.find(host_link) != -1 or (href_hostname.find(domain + '.') != -1 and href.find("televox.west.com") == -1) \
                or href.find("iapps") != -1:
            if new_pass1:
                entry_print("***********************************************")
                entry_print("LINKS THAT GO BACK TO OLD SITE!")
                entry_print("Old URL: " + old_url)
                entry_print("New URL: " + new_url)
                detail.write("LINKS THAT GO BACK TO OLD SITE!\n")
                detail.write("Old URL: " + old_url + "\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass1 = False
            entry_print("Bad tag: " + str(tag))
            detail.write("Bad tag: " + str(tag) + "\n")
    if not new_pass1:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")

    # check for non-friendly urls
    new_pass2 = True
    for tag in new_tags:
        href = tag['href']
        if not href:
            continue
        if href.find("televox.west.com") != -1:
            if new_pass2:
                entry_print("***********************************************")
                entry_print("NON-FRIENDLY URL FOUND!")
                entry_print("New URL: " + new_url)
                detail.write("NON-FRIENDLY URL FOUND!\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass2 = False
            entry_print("Bad tag: " + str(tag))
            detail.write("Bad tag: " + str(tag) + "\n")
    if not new_pass2:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")

    # remove file links (filter rather than mutating the lists while iterating)
    old_tags = [
        tag for tag in old_tags
        if not re.search("jpg|png|pdf|mp4", tag.get('href'))
    ]
    new_tags = [
        tag for tag in new_tags
        if not re.search("jpg|png|pdf|mp4|UserFile", tag.get('href'))
    ]

    bad_tags = []
    if len(old_tags) != len(new_tags):
        # remove 404 pages and file links from the old tags
        # (iterate over a copy so removals do not skip elements)
        for tag in list(old_tags):
            url = tag.get('href')
            if url is None:
                continue
            if url.startswith("https://"):
                continue
            if url.startswith("tel:") or url.startswith(
                    "mailto:") or url.find("#") != -1:
                continue
            if url.startswith("/"):
                url = "http://" + old_hostname + url
            old_target = get_soup(url)
            old_target_title = get_meta_soup(old_target, url)['title']
            if old_target_title.find("404") != -1 \
                    or re.search("page not found|the resource cannot be found", old_target_title.lower()) \
                    or old_target_title == "none":
                bad_tags.append((str(tag), old_target_title))
                old_tags.remove(tag)

    # check invalid links in new site
    new_invalid_links = []
    for tag in new_tags:
        url = tag.get('href')
        if url is None:
            continue
        if url.startswith("https://"):
            continue
        if url.startswith("tel:") or url.startswith(
                "mailto:") or url.find("#") != -1 or url.startswith("/common"):
            continue
        if url.startswith("/"):
            url = "http://" + new_hostname + url
        if url.find("televox.west.com") != -1:
            new_target = get_soup(url, browser)
        else:
            new_target = get_soup(url)
        new_target_title = get_meta_soup(new_target, url)['title']
        if new_target_title.find("404") != -1 or new_target_title == "Page Not Found" or new_target_title == "none" \
                or new_target_title == "The resource cannot be found.":
            new_invalid_links.append((str(tag), new_target_title))

    if new_invalid_links:
        entry_print("***********************************************")
        entry_print("INVALID LINK FOUND IN NEW SITE!")
        entry_print("New URL: " + new_url)
        detail.write("INVALID LINK FOUND IN NEW SITE!\n")
        detail.write("New URL: " + new_url + "\n")
        ind = 0
        for tag, target in new_invalid_links:
            ind += 1
            entry_print("Bad tag" + str(ind) + ": " + tag)
            entry_print("Target title: " + target)
            detail.write("Bad tag" + str(ind) + ": " + tag + "\n")
            detail.write("Target title: " + target + "\n")
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")

    # check that number of links match if not, return
    if len(new_tags) != len(old_tags):
        entry_print("***********************************************")
        entry_print(
            "NUMBER OF LINKS DIFFERENT OR 404 LINK EXISTS IN NEW PAGE!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_tags)))
        entry_print("Number of new links: " + str(len(new_tags)))
        entry_print("Old tags: " + str(old_tags))
        entry_print("New tags: " + str(new_tags))
        if bad_tags:
            entry_print("404 tags in old site (removed):")
            for ind in range(len(bad_tags)):
                entry_print("Tag" + str(ind + 1) + ": " + bad_tags[ind][0])
                entry_print("Target title: " + bad_tags[ind][1])
        entry_print("***********************************************")
        detail.write(
            "NUMBER OF LINKS DIFFERENT OR 404 LINK EXISTS IN NEW PAGE!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_tags)) + "\n")
        detail.write("Number of new links: " + str(len(new_tags)) + "\n")
        if bad_tags:
            detail.write("404 tag(s) in old site (removed):\n")
            for ind in range(len(bad_tags)):
                detail.write("Tag" + str(ind + 1) + ": " + bad_tags[ind][0] +
                             "\n")
                detail.write("Target title: " + bad_tags[ind][1] + "\n")
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
        detail.close()
        return False

    # check that new and old links match
    new_pass3 = True
    count = 0
    for ind in range(len(new_tags)):
        old_link = old_tags[ind]['href'].replace("\\", "/").strip()
        new_link = new_tags[ind]['href'].replace("\\", "/").strip()
        if old_link == new_link:
            continue

        # take out the duplication part for old_link
        if old_link.find("#") != -1:
            old_ind = old_link.find("#")
            old_link = old_link[old_ind:]
        if new_link.find("#") != -1:
            new_ind = new_link.find("#")
            new_link = new_link[new_ind:]

        temp = old_link.split("/")
        if len(temp) > 2:
            if temp[-1] == temp[-2]:
                old_link = "/".join(temp[:-1])
        if urlparse(old_link).path == urlparse(new_link).path:
            continue

        if old_link.startswith("/"):
            old_link = "http://" + old_hostname + old_link
        # if the old link points to the homepage, then set it as "/"
        if old_link.endswith("/home") or old_link.endswith("/main"):
            old_link = "/"
        if new_link == "/home" or new_link == "/main":
            new_link = "/"
        if new_link != "/" and new_link.endswith("/"):
            new_link = new_link[:-1]
        if old_link != "/" and old_link.endswith("/"):
            old_link = old_link[:-1]

        if old_link != new_link and not new_link.startswith("/common"):
            if old_link.find("#") != -1 or new_link.find("#") != -1:
                count += 1
                if new_pass3:
                    entry_print(
                        "***********************************************")
                    entry_print("LINKS THAT DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("LINKS THAT DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    new_pass3 = False
                entry_print("Old link" + str(count) + ": " + old_link)
                entry_print("New link" + str(count) + ": " + new_link)
                entry_print("Old tag" + str(count) + ": " + str(old_tags[ind]))
                entry_print("New tag" + str(count) + ": " + str(new_tags[ind]))
                detail.write("Old tag" + str(count) + ": " +
                             str(old_tags[ind]) + "\n")
                detail.write("New tag" + str(count) + ": " +
                             str(new_tags[ind]) + "\n")
                continue

            if old_link.startswith("/"):
                old_link = "http://" + old_hostname + old_link.strip()
            if new_link.startswith("/"):
                new_link = "http://" + new_hostname + new_link.strip()

            old_target = get_soup(old_link)
            new_target = get_soup(new_link, browser=browser)
            old_target_title = replace_special(
                get_meta_soup(old_target, old_link)['title'])
            new_target_title = replace_special(
                get_meta_soup(new_target, new_link)['title'])

            if new_target_title.endswith("..."):
                new_target_title = new_target_title[:-3]
                old_target_title = old_target_title[:len(new_target_title)]

            if old_target_title != new_target_title:
                count += 1
                if new_pass3:
                    entry_print(
                        "***********************************************")
                    entry_print("LINKS THAT DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("LINKS THAT DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    new_pass3 = False
                entry_print("Old link" + str(count) + ": " + old_link)
                entry_print("New link" + str(count) + ": " + new_link)
                entry_print("Old target" + str(count) + ": " +
                            old_target_title)
                entry_print("New target" + str(count) + ": " +
                            new_target_title)
                entry_print("Old tag" + str(count) + ": " + str(old_tags[ind]))
                entry_print("New tag" + str(count) + ": " + str(new_tags[ind]))
                detail.write("Old tag" + str(count) + ": " +
                             str(old_tags[ind]) + "\n")
                detail.write("New tag" + str(count) + ": " +
                             str(new_tags[ind]) + "\n")
    if not new_pass3:
        detail.write("-----------------------------------------------\n")
        entry_print("***********************************************")

    detail.close()
    return new_pass1 and new_pass2 and new_pass3
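
compare_link_soup and check_homepage_link both rely on get_domain to spot links that still point at the old site. A sketch of the assumed helper, which pulls the registrable name out of a URL (for example, "example" from http://www.example.com/about); the real implementation may be more thorough.

from urlparse import urlparse  # urllib.parse on Python 3


def get_domain(url):
    # placeholder: return the second-level domain name of a URL
    hostname = urlparse(url).hostname or ""
    parts = hostname.split(".")
    if len(parts) >= 2:
        return parts[-2]
    return hostname
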
Code Example #9
def compare_form_soup(old_soup, new_soup, old_url, new_url):
    detail = open("result\\form_detail.txt", 'a')
    require_pass = True
    title_pass = True
    entry_pass = True
    auth_pass = True
    old_container = old_soup.find('div', class_="form-container")
    new_container = new_soup.find('div', class_="secureform")

    if not old_container and not new_container:
        detail.close()
        return True
    elif not old_container and new_container:
        record_error(old_url, "form container")
        detail.close()
        return False
    elif old_container and not new_container:
        record_error(new_url, "form container")
        detail.close()
        return False

    # check the "required field" text in new form
    if not new_container.find(text=re.compile("required field")):
        entry_print("***********************************************")
        entry_print("FORM MISSING '* REQUIRED FIELD' TITLE!")
        entry_print("New URL: " + new_url)
        entry_print("***********************************************")
        detail.write("FORM MISSING '* REQUIRED FIELD' TITLE!\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("-----------------------------------------------\n")
        require_pass = False

    # find all the entry names and choices from old page
    if old_container:
        for (name, kwargs) in settings["OLD_FORM_ENTRY_IGNORE"]:
            for s in old_container.find_all(name, **kwargs):
                s.extract()
    old_entries = [replace_special(text) for text in old_container.stripped_strings]

    # find all the entry names and choices from new page
    if new_container:
        for (name, kwargs) in settings["NEW_FORM_ENTRY_IGNORE"]:
            for s in new_container.find_all(name, **kwargs):
                s.extract()
    new_entries = [replace_special(text) for text in new_container.stripped_strings]

    # compare entry names
    if len(old_entries) != len(new_entries):
        entry_print("***********************************************")
        entry_print("NUMBER OF FORM ENTRIES DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old entries: " + str(len(old_entries)))
        entry_print("Number of new entries: " + str(len(new_entries)))
        entry_print("Old entries: " + str(old_entries))
        entry_print("New entries: " + str(new_entries))
        detail.write("NUMBER OF FORM ENTRIES DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old entries: " + str(len(old_entries)) + "\n")
        detail.write("Number of new entries: " + str(len(new_entries)) + "\n")
        detail.write("Old entries: " + str(old_entries) + "\n")
        detail.write("New entries: " + str(new_entries) + "\n")
        entry_pass = False

        # try to track down the issue
        for ind in range(min(len(old_entries), len(new_entries))):
            if old_entries[ind] != new_entries[ind] and old_entries[ind].upper() != new_entries[ind].upper():
                entry_print("FIRST DIFFERENCE:")
                entry_print("Old entry name: " + old_entries[ind])
                entry_print("New entry name: " + new_entries[ind])
                detail.write("FIRST DIFFERENCE:\n")
                detail.write("Old entry name: " + old_entries[ind] + "\n")
                detail.write("New entry name: " + new_entries[ind] + "\n")
                break
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
    else:
        # compare each entry
        count = 0
        old_diff = []
        new_diff = []
        new_pass = True

        for old_entry, new_entry in zip(old_entries, new_entries):
            if old_entry != new_entry and old_entry.upper() != new_entry.upper():
                old_diff.append(old_entry)
                new_diff.append(new_entry)
        old_diff_dup = [i.upper() for i in old_diff]
        new_diff_dup = [i.upper() for i in new_diff]
        old_diff_dup.sort()
        new_diff_dup.sort()

        for old_entry, new_entry in zip(old_diff_dup, new_diff_dup):
            if old_entry != new_entry:
                new_pass = False
                break

        if not new_pass:
            for old_entry, new_entry in zip(old_diff, new_diff):
                count += 1
                if entry_pass:
                    entry_print("***********************************************")
                    entry_print("FORM ENTRIES DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("FORM ENTRIES DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    entry_pass = False
                entry_print("Old entry name" + str(count) + ": " + old_entry)
                entry_print("New entry name" + str(count) + ": " + new_entry)
                detail.write("Old entry name" + str(count) + ": " + old_entry + "\n")
                detail.write("New entry name" + str(count) + ": " + new_entry + "\n")
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")

    detail.close()
    return require_pass and title_pass and entry_pass and auth_pass
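
A sketch of how these routines might be driven for a single page pair; the wrapper name, the result aggregation, and the treatment of the -1 sentinel are assumptions rather than part of the original source.

def compare_page(old_url, new_url, browser=None):
    # placeholder driver: fetch both pages and run every comparison
    old_soup = get_soup(old_url)
    new_soup = get_soup(new_url, browser)
    results = {
        "meta": compare_meta_soup(old_soup, new_soup, old_url, new_url),
        "content": compare_content_soup(old_soup, new_soup, old_url, new_url),
        "links": compare_link_soup(old_soup, new_soup, old_url, new_url, browser),
        "form": compare_form_soup(old_soup, new_soup, old_url, new_url),
    }
    # treat the -1 "missing page" sentinel as a failure as well
    return all(r is True for r in results.values()), results
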