class ParserBS(AbstractParser):
    """Custom HTML parser built on top of BeautifulSoup."""

    def __init__(self, html_raw: str, parser_bs_type: str = "html.parser"):
        """Parse *html_raw* eagerly with the given bs4 parser backend."""
        self.html_parsed = BeautifulSoup(html_raw, parser_bs_type)

    @property
    def html_raw(self) -> str:
        """Serialized form of the parsed document."""
        return str(self.html_parsed)  # str() over direct __str__() call

    @cached_property
    def title(self) -> str:
        """Text of the <title> tag, or "" when the document has none."""
        title = self.html_parsed.find("title")
        # Conditional expression instead of the error-prone `and/or` idiom;
        # behavior is identical (a missing title still yields "").
        return title.text if title else ""

    @cached_property
    def anchor_nodes(self) -> Iterable[ResultSet]:
        """All <a> tags that carry an href attribute."""
        return self.html_parsed.find_all("a", attrs={"href": True})

    def get_related_anchors_href(self) -> Iterable[str]:
        """Unique href values of anchors that point to related URLs.

        Filtering is delegated to `_is_href_url_related` (defined on the
        parser hierarchy, not visible in this chunk).
        """
        return {
            node.attrs.get("href")
            for node in self.anchor_nodes
            if ParserBS._is_href_url_related(node.attrs.get("href"))
        }

    def __repr__(self):
        return repr(self.html_parsed)  # repr() over direct __repr__() call
def change_encode():
    """Inject a <head> with a UTF-8 content-type meta tag into each article.

    Rewrites every article HTML file in place. Relies on the module-level
    `path` directory and `get_articles()` for the list of article names.
    """
    for item in get_articles():
        file_path = path + "/" + item + ".html"
        with open(file_path, "r") as f:
            html = f.read()
        # Explicit parser avoids bs4's GuessedAtParserWarning and keeps the
        # output independent of which optional parser libraries are installed.
        soup = BeautifulSoup(html, "html.parser")
        tag_head = soup.new_tag("head")
        tag_meta = soup.new_tag("meta")
        tag_meta["http-equiv"] = "content-type"
        tag_meta["content"] = "text/html; charset=utf-8"
        # Place the new <head> just before <body>, then put the meta inside it.
        soup.html.body.insert_before(tag_head)
        soup.html.head.append(tag_meta)
        with open(file_path, "w") as f:
            # str() serializes the tree (equivalent to the old __repr__() call).
            f.write(str(soup))
def get_page_content():
    """Pull a Confluence page, save its images locally, and wrap the HTML
    as a Polymer dom-module file under root_dir/root_folder.

    Prompts the user for the source URL and page metadata. Relies on the
    module-level `ssn` (authenticated requests session), `root_dir` and
    `root_folder`. Recurses via `addanotherpage` until the user stops.
    """
    urlinput = input("What URL do you want to pull from?")
    # Temporary random file name; renamed to "<pageid>.html" once known.
    filename = ''.join(
        random.choice(string.ascii_uppercase + string.digits)
        for _ in range(1, 10))
    filename += ".html"

    page = ssn.get(urlinput)
    soup = BeautifulSoup(page.content, 'html.parser')

    def line_prepender(file, line):
        """Prepend *line* to the named file inside root_dir/root_folder."""
        with open(root_dir + root_folder + file, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)

    pagecontent = soup.find("div", {"id": "main-content"})
    # Re-parse the fragment so we work on a tree detached from the full page.
    pagecontent = BeautifulSoup(repr(pagecontent), 'html.parser')

    # Download embedded images and rewrite their tags to local static paths.
    for item in pagecontent.find_all(
            "img", {"class": "confluence-embedded-image"}):
        list_att = list(item.attrs.keys())
        imagesrc = item['src']
        imageURL = "https://opensource.ncsa.illinois.edu" + imagesrc
        r = ssn.get(imageURL, allow_redirects=True)
        picturesfilename = str(item['data-linked-resource-default-alias'])
        # `with` closes the handle (the original leaked the file object).
        with open(root_dir + "/images/" + picturesfilename, 'wb') as img_file:
            img_file.write(r.content)
        # Keep only the layout-relevant attributes on the <img> tag.
        for att in list_att:
            if att not in ['src', 'width', 'height', 'scale']:
                del item[att]
        item['src'] = '/static/images/' + picturesfilename

    # Strip Confluence CSS classes from every tag.
    for tag in pagecontent():
        del tag["class"]

    # Write the page body; the original opened this handle with "w+" and
    # never closed it before appending through a second handle on the same
    # file, risking interleaved buffered writes. `with` flushes and closes.
    with open(root_dir + root_folder + filename, "w+") as newfile:
        newfile.write(pagecontent.prettify())

    # Collect page identifiers and append the Polymer footer.
    header = input("What should the heading be? (Title of the des-card)")
    pageid = input(
        "What should the dom-module id be? (des-home, des-data, etc.)")
    pageclass = input(
        "What Polymer class should this is labeled as (desHome)?")
    endtext = """
    </div>
    </des-card>
    </template>
    <script>
    class {pageclass} extends Polymer.Element {{
    static get is() {{ return '{pageid}'; }}
    }}
    window.customElements.define({pageclass}.is,{pageclass});
    </script>
    </dom-module>
    """.format(pageclass=pageclass, pageid=pageid)
    with open(root_dir + root_folder + filename, 'a') as newfile:
        newfile.write(endtext)

    # Rename the temporary file to the user-supplied dom-module id.
    newfilename = pageid + ".html"
    os.rename(root_dir + root_folder + filename,
              root_dir + root_folder + newfilename)

    # Prepend the dom-module header now that the id and heading are known.
    initext = """\
    <dom-module id='{pageid}'>
    <template>
    <style include='shared-styles'>
    :host {{
    display: block;
    padding: 10px;
    }}
    </style>
    <des-card heading="{header}">
    <div class=card-content>
    """.format(pageid=pageid, header=header)
    line_prepender(newfilename, initext)

    def addanotherpage():
        """Ask whether to import another page; recurse until a valid answer."""
        answer = input("Do you have another page to add? Y/N")
        if answer in ("yes", "Yes", "y"):
            get_page_content()
            return
        if answer in ("no", "No", "n"):
            print(
                "Please note that locations within imported pages for images and"
                "other files will need to be changed to reflect the correct corresponding"
                "location on the user's computer "
                "(unless the file isn't imported from a local location)")
            sleep(2)
            print("Exiting program")
            # Log out of the Confluence session before exiting.
            ssn.get(
                "https://opensource.ncsa.illinois.edu/confluence/login.action?logout=true"
            )
            return
        print("Please enter Yes or No")
        addanotherpage()

    addanotherpage()