def parse_article(self, url):
    """Fetch *url* and return its "content" div converted to Markdown.

    The converted Markdown has every newline removed, reproducing the
    original single-line output.
    """
    page_text = requests.get(url=url, headers=self.headers).text
    soup = bs(page_text, features="lxml")
    content_div = soup.find("div", class_="content")
    markdown = Tomd(str(content_div)).markdown
    # Flatten the result onto one line, as the original did.
    return markdown.replace("\n", "")
def build_file(name):
    """Convert posts/<name>/index.html into Hexo-flavored Markdown.

    Reads the rendered article page, writes front matter (title, date,
    tags) followed by the article body, translating images, headings,
    iframes and highlighted code figures into Markdown/Hexo equivalents,
    and saves the result to posts/<name>.md.
    """
    path = "posts/" + name + "/index.html"
    # Bug fix: the handle was opened without an encoding (which can yield
    # garbage on non-ASCII pages) and was never closed. Use a context
    # manager with explicit UTF-8.
    with codecs.open(path, 'r', encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, features="html5lib")
    # Front matter.
    md = "---"
    md += "\ntitle: " + soup.find("h1").get_text().strip()
    md += "\ndate: " + soup.find("time").get_text().replace("-", "/")
    md += "\ntags: " + soup.find("a", "tag-link").get_text()
    md += "\n---\n"
    content = soup.find(itemprop="articleBody")
    for child in content.children:
        if str(child)[:4] == '<img':
            # Keep only the file name; Hexo resolves it via asset_img.
            end_point = child['src'].rfind('/') + 1
            file_name = child['src'][end_point:]
            text = "{% asset_img " + file_name + " %}"
            md += "\n" + text + "\n"
        elif str(child)[:2] == '<h':
            # "<h3>…" -> "### …" and so on for other heading levels.
            num_pounds = "#" * int(str(child)[2])
            md += "\n" + num_pounds + " " + child.get_text() + "\n"
        elif str(child)[:3] == '<if':
            # <iframe> elements pass through verbatim.
            md += "\n" + str(child) + "\n"
        elif str(child)[:24] == '<figure class="highlight':
            # Highlighted code figure: recover the language tag and the
            # code text from the table Tomd leaves behind.
            code_sample = str(child)
            code_type = code_sample[25:code_sample.find('"', 24)]
            temp_md = Tomd(str(child)).markdown
            temp_md = temp_md[temp_md.find('<td class="code"'):]
            temp_md = BeautifulSoup(temp_md, features="html5lib").find("pre")
            pre_md = str(temp_md)
            pre_md = pre_md[5:-6]  # strip the "<pre>" / "</pre>" wrapper
            temp_md = "\n``` "
            temp_md += code_type + '\n'
            for i, char in enumerate(pre_md):
                # Insert a newline in front of each "<br/>" marker; the
                # markers themselves are stripped below.
                if pre_md[i:i + 5] == '<br/>':
                    temp_md += '\n'
                temp_md += char
            temp_md += '```'
            md += temp_md.replace('<br/>', '')
        else:
            md += Tomd(str(child)).markdown
    # Bug fix: explicit UTF-8 avoids a UnicodeEncodeError on platforms
    # whose default encoding cannot represent the article text.
    with open('posts/' + name + '.md', 'w', encoding="utf-8") as file:
        file.write(md)
def soup_snapshot_2md(soup):
    """Convert a scraped minecraft.net article page into a Markdown file.

    Extracts the header image, title, subtitle, article body, author and
    publish date from *soup*, chooses an output directory based on the
    kind of release named in the title, then writes the document via
    writedoc().
    """
    # Accumulated Markdown output.
    text = str()
    # Locate the relevant nodes.
    tags = soup.find_all(
        class_="page-section page-section--first article-body")[0]
    body_tag = soup.find_all("div", class_="end-with-block")[0]
    author_tag = soup.find_all("div", class_="attribution pb-3")[0]
    # Header image.
    img_tag = soup.find_all(class_="article-head__image-container")[0]
    img_url = f"![]({'https://www.minecraft.net'}{img_tag.img['src']})"
    text = img_url + "\n"
    # Title and subtitle, e.g. 'Minecraft Snapshot 20w21a'.
    head = tags.h1.get_text(strip=True)
    lead = tags.p.get_text(strip=True)
    text += "# " + head + "\n"
    text += "## " + lead + "\n"
    # Decide output directory / file name from the release kind.
    if "snapshot" in head.lower():
        head_name = head[head.rfind(" ") + 1:]
        dirname = "./snapshots/" + head_name
    elif "pre-release" in head.lower():
        head_name = head[head.find(" ") + 1:]
        dirname = "./pre_release/" + head_name
    elif "candidate" in head.lower():
        head_name = head[head.find(" ") + 1:]
        dirname = "./candidate/" + head_name
    # Bug fix: the original condition was
    #     elif "edition" or "released" in head.lower():
    # which is always truthy ("edition" is a non-empty string), so every
    # remaining title fell into this branch and the "update" branch below
    # was unreachable.
    elif "edition" in head.lower() or "released" in head.lower():
        head_name = head
        dirname = "./edition/" + head_name
    elif "update" in head.lower():
        head_name = head[head.find(" ") + 1:]
        dirname = "./edition/" + head_name
    else:
        # Fallback mirrors the old catch-all behavior so dirname is
        # always bound.
        head_name = head
        dirname = "./edition/" + head_name
    filename = head_name
    print(dirname)
    # Convert the article body to Markdown.
    body_html = str()
    for child in body_tag.children:
        body_html += str(child)
    output = Tomd(body_html).markdown
    output = output.replace("<br/>", "")
    # NOTE(review): the original called replace("<", "<") and
    # replace(">", ">"), which are no-ops; presumably the intent was to
    # unescape HTML entities — confirm against real article output.
    output = output.replace("&lt;", "<")
    output = output.replace("&gt;", ">")
    output = output.replace("<li>", "- ")
    output = output.replace("</li>\n", "")
    text += output
    # Author block and publish date.
    author = author_tag.dl.get_text()
    try:
        author_img_url = f"![]({'https://www.minecraft.net'}{author_tag.img['src']})"
    except TypeError:
        # Some articles have no author image.
        author_img_url = f"![]()"
    pubdate = author_tag.find(class_="pubDate").attrs['data-value'][:10]
    text += (author.rstrip("\n") + "\n" + pubdate + "\n" + author_img_url + "\n")
    text = text.replace("Written By", "**Written By**")
    text = text.replace("Published", "**Published**")
    # Create the target directory, then write the document.
    mkdir(dirname)
    writedoc(text, dirname, filename)
def build_file(name):
    """Convert posts/<name>/index.html into Hexo-flavored Markdown.

    Corrected variant: reads with explicit UTF-8 (without it the read
    returned null) and writes with explicit UTF-8 (without it a
    UnicodeEncodeError was raised). Tags extraction is disabled because
    the source pages do not render tags.
    """
    path = "posts/" + name + "/index.html"
    # Added "encoding" parameter, else it would return null.
    # Bug fix: the handle was previously never closed; the context
    # manager guarantees it is.
    with codecs.open(path, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, features="html5lib")
    # Front matter.
    md = "---"
    md += "\ntitle: " + soup.find("h1").get_text().strip()
    md += "\ndate: " + soup.find("time").get_text().replace("-", "/")
    # This line caused me some trouble, because in my html files
    # the tags were not displayed.
    # md += "\ntags: " + soup.find("a", "tag-link").get_text()
    md += "\n---\n"
    content = soup.find(itemprop="articleBody")
    for child in content.children:
        if str(child)[:4] == "<img":
            # Keep only the file name; Hexo resolves it via asset_img.
            end_point = child["src"].rfind("/") + 1
            file_name = child["src"][end_point:]
            text = "{% asset_img " + file_name + " %}"
            md += "\n" + text + "\n"
        elif str(child)[:2] == "<h":
            # "<h3>…" -> "### …" and so on for other heading levels.
            num_pounds = "#" * int(str(child)[2])
            md += "\n" + num_pounds + " " + child.get_text() + "\n"
        elif str(child)[:3] == "<if":
            # <iframe> elements pass through verbatim.
            md += "\n" + str(child) + "\n"
        elif str(child)[:24] == '<figure class="highlight':
            # Highlighted code figure: recover the language tag and the
            # code text from the table Tomd leaves behind.
            code_sample = str(child)
            code_type = code_sample[25:code_sample.find('"', 24)]
            temp_md = Tomd(str(child)).markdown
            temp_md = temp_md[temp_md.find('<td class="code"'):]
            temp_md = BeautifulSoup(temp_md, features="html5lib").find("pre")
            pre_md = str(temp_md)
            pre_md = pre_md[5:-6]  # strip the "<pre>" / "</pre>" wrapper
            temp_md = "\n``` "
            temp_md += code_type + "\n"
            for i, char in enumerate(pre_md):
                # Insert a newline in front of each "<br/>" marker; the
                # markers themselves are stripped below.
                if pre_md[i:i + 5] == "<br/>":
                    temp_md += "\n"
                temp_md += char
            temp_md += "```"
            md += temp_md.replace("<br/>", "")
        else:
            md += Tomd(str(child)).markdown
    # Added "encoding" parameter, else it would throw a UnicodeEncodeError.
    with open("posts/" + name + ".md", "w", encoding="utf-8") as file:
        file.write(md)