Example #1
0
class ParserBS(AbstractParser):
    """
    The custom parser over BeautifulSoup.

    Wraps a parsed HTML tree and exposes convenience accessors for the
    title and anchor links.
    """
    def __init__(self, html_raw: str, parser_bs_type: str = "html.parser"):
        # Parse eagerly; all accessors below read from this tree.
        self.html_parsed = BeautifulSoup(html_raw, parser_bs_type)

    @property
    def html_raw(self) -> str:
        """Serialized form of the parsed HTML tree."""
        return str(self.html_parsed)

    @cached_property
    def title(self) -> str:
        """Text of the <title> tag, or "" when the document has none."""
        title = self.html_parsed.find("title")
        # Conditional expression instead of the `x and y or z` trick,
        # which misbehaves when `y` is falsy.
        return title.text if title else ""

    @cached_property
    def anchor_nodes(self) -> Iterable[ResultSet]:
        """All <a> tags that carry an href attribute."""
        return self.html_parsed.find_all("a", attrs={"href": True})

    def get_related_anchors_href(self) -> Iterable[str]:
        """Unique hrefs of anchors considered related.

        Relies on `_is_href_url_related` (defined outside this span) to
        decide relatedness.
        """
        # anchor_nodes is filtered on href=True, so the attribute exists.
        return {
            node.attrs["href"]
            for node in self.anchor_nodes
            if ParserBS._is_href_url_related(node.attrs["href"])
        }

    def __repr__(self):
        return repr(self.html_parsed)
Example #2
0
def change_encode():
    """Insert a <head> declaring UTF-8 into every article HTML file.

    For each article returned by `get_articles()` the file
    `<path>/<item>.html` is re-parsed, a new <head> containing a
    content-type meta tag is inserted before <body>, and the file is
    rewritten in place.
    """
    for item in get_articles():
        file_path = path + "/" + item + ".html"
        with open(file_path, "r") as f:
            # Explicit parser: BeautifulSoup(html) alone warns and picks
            # whichever parser is installed, making output vary by host.
            soup = BeautifulSoup(f.read(), "html.parser")
        tag_head = soup.new_tag("head")
        tag_meta = soup.new_tag("meta")
        tag_meta["http-equiv"] = "content-type"
        tag_meta["content"] = "text/html; charset=utf-8"
        # Place the new head just before <body>, then attach the meta to
        # the document's first <head> (the one just inserted when the
        # document had none).
        soup.html.body.insert_before(tag_head)
        soup.html.head.append(tag_meta)
        with open(file_path, "w") as f:
            f.write(str(soup))
Example #3
0
def get_page_content():
    """Pull a Confluence page and save it as a Polymer dom-module page.

    Downloads the page at a user-supplied URL, extracts the
    #main-content div, localizes its embedded images into
    `<root_dir>/images/`, wraps the content in dom-module/des-card
    boilerplate, and writes the result to
    `<root_dir><root_folder><pageid>.html`. Recurses (via
    `addanotherpage`) while the user keeps answering yes.
    """
    urlinput = input("What URL do you want to pull from?")
    # Temporary 9-character name (range(1, 10) yields 9 draws); the file
    # is renamed to "<pageid>.html" below, so the length is immaterial.
    filename = ''.join(
        random.choice(string.ascii_uppercase + string.digits)
        for _ in range(1, 10))
    filename += ".html"
    page = ssn.get(urlinput)
    soup = BeautifulSoup(page.content, 'html.parser')

    # function to add content to the beginning of the file
    def line_prepender(file, line):
        with open(root_dir + root_folder + file, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)

    pagecontent = soup.find("div", {"id": "main-content"})
    # Re-parse the fragment so later edits work on a standalone tree.
    pagecontent = BeautifulSoup(str(pagecontent), 'html.parser')

    # saves pictures on the page to separate files
    for item in pagecontent.find_all("img",
                                     {"class": "confluence-embedded-image"}):
        imageURL = "https://opensource.ncsa.illinois.edu" + item['src']
        r = ssn.get(imageURL, allow_redirects=True)
        picturesfilename = str(item['data-linked-resource-default-alias'])
        # `with` closes the image file; the original leaked the handle.
        with open(root_dir + "/images/" + picturesfilename, 'wb') as img:
            img.write(r.content)
        # Snapshot the keys before deleting so we don't mutate the dict
        # while iterating it.
        for att in list(item.attrs.keys()):
            if att not in ['src', 'width', 'height', 'scale']:
                del item[att]
        item['src'] = '/static/images/' + picturesfilename

    # delete classes
    for tag in pagecontent():
        del tag["class"]

    # Close the file before any later re-open/rename so the buffered
    # content is actually flushed to disk (the original left two handles
    # open, risking lost writes during os.rename / line_prepender).
    with open(root_dir + root_folder + filename, "w+") as newfile:
        newfile.write(pagecontent.prettify())

    # add page identifiers to end of file
    header = input("What should the heading be? (Title of the des-card)")
    pageid = input(
        "What should the dom-module id be? (des-home, des-data, etc.)")
    pageclass = input(
        "What Polymer class should this is labeled as (desHome)?")
    endtext = """
    </div>
    </des-card>
    </template>
    <script>
    class {pageclass} extends Polymer.Element {{
      static get is() {{ return '{pageid}'; }}
       }}
     window.customElements.define({pageclass}.is,{pageclass});
     </script>
     </dom-module>
    """.format(pageclass=pageclass, pageid=pageid)
    with open(root_dir + root_folder + filename, 'a') as newfile:
        newfile.write(endtext)

    # renames file to match the dom-module id
    newfilename = pageid + ".html"
    os.rename(root_dir + root_folder + filename,
              root_dir + root_folder + newfilename)

    # add page identifiers to beginning of file
    initext = """\
    <dom-module id='{pageid}'>
    <template>
    <style include='shared-styles'>
    :host {{
       display: block;
       padding: 10px;
       }}
    </style>
    <des-card heading="{header}">
    <div class=card-content>
    """.format(pageid=pageid, header=header)
    line_prepender(newfilename, initext)

    # adds more pages
    def addanotherpage():
        # Normalize case so "Y"/"N"/"YES" work as the prompt implies;
        # every answer the original accepted is still accepted.
        answer = input("Do you have another page to add? Y/N").strip().lower()
        if answer in ("yes", "y"):
            get_page_content()
        elif answer in ("no", "n"):
            print(
                "Please note that locations within imported pages for images and"
                "other files will need to be changed to reflect the correct corresponding"
                "location on the user's computer "
                "(unless the file isn't imported from a local location)")
            sleep(2)
            print("Exiting program")
            ssn.get(
                "https://opensource.ncsa.illinois.edu/confluence/login.action?logout=true"
            )
        else:
            print("Please enter Yes or No")
            addanotherpage()

    addanotherpage()