コード例 #1
0
ファイル: main.py プロジェクト: wkias/zsxq-crawler
 def parse_article(self, url):
     """Download ``url`` and return its ``div.content`` element as Markdown.

     The page is requested with ``self.headers``, parsed with the ``lxml``
     parser, and the first ``<div class="content">`` is converted by Tomd.
     Every newline is removed from the result, so the returned Markdown is
     a single line.
     """
     response = requests.get(url=url, headers=self.headers)
     soup = bs(response.text, features="lxml")
     content_div = soup.find("div", class_="content")
     markdown = Tomd(str(content_div)).markdown
     # Collapse the converted document onto one line.
     return markdown.replace("\n", "")
コード例 #2
0
def build_file(name):
    """Convert ``posts/<name>/index.html`` into ``posts/<name>.md``.

    The output starts with a ``---`` delimited front-matter header (title,
    date and, when the page has one, the first tag) followed by the children
    of the element marked ``itemprop="articleBody"`` converted one by one:

    * ``<img>``       -> ``{% asset_img <file name> %}``
    * ``<h1>``-``<h6>`` -> ATX heading with the matching number of ``#``
    * ``<if...>`` (e.g. iframes) -> copied through as raw HTML
    * ``<figure class="highlight ...">`` -> fenced code block
    * anything else   -> converted by Tomd

    Args:
        name: Post directory name under ``posts/``; also the stem of the
            generated Markdown file.

    Raises:
        AttributeError: If the page has no ``<h1>``, ``<time>`` or
            ``articleBody`` element.
        OSError: If the input cannot be read or the output cannot be written.
    """
    path = "posts/" + name + "/index.html"

    # Read inside a context manager so the handle is always closed, and pin
    # the encoding so the result does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        html = f.read()

    soup = BeautifulSoup(html, features="html5lib")

    # Front matter.
    md_parts = [
        "---",
        "\ntitle: " + soup.find("h1").get_text().strip(),
        "\ndate: " + soup.find("time").get_text().replace("-", "/"),
    ]
    # Not every page carries a tag link; skip the entry instead of crashing
    # on ``None.get_text()``.
    tag_link = soup.find("a", "tag-link")
    if tag_link is not None:
        md_parts.append("\ntags: " + tag_link.get_text())
    md_parts.append("\n---\n")

    content = soup.find(itemprop="articleBody")

    for child in content.children:
        markup = str(child)
        if markup.startswith('<img'):
            # Keep only the file name part of the image URL.
            src = child['src']
            file_name = src[src.rfind('/') + 1:]
            md_parts.append("\n{% asset_img " + file_name + " %}\n")
        elif (markup.startswith('<h') and len(markup) > 2
                and markup[2] in "123456"):
            # Only real headings: a bare '<h' prefix test would also match
            # <hr>/<header> and make int() raise ValueError.
            num_pounds = "#" * int(markup[2])
            md_parts.append("\n" + num_pounds + " " + child.get_text() + "\n")
        elif markup.startswith('<if'):
            md_parts.append("\n" + markup + "\n")
        elif markup.startswith('<figure class="highlight'):
            # The language is whatever follows 'highlight ' inside the class
            # attribute, up to the closing quote.
            code_type = markup[25:markup.find('"', 24)]

            code_html = Tomd(markup).markdown
            code_html = code_html[code_html.find('<td class="code"'):]
            pre_tag = BeautifulSoup(code_html, features="html5lib").find("pre")

            # Drop the first 5 and last 6 characters, i.e. a bare
            # "<pre>" / "</pre>" wrapper, then turn line breaks into newlines.
            code = str(pre_tag)[5:-6].replace('<br/>', '\n')

            md_parts.append("\n``` " + code_type + "\n" + code + "```")
        else:
            md_parts.append(Tomd(markup).markdown)

    with open('posts/' + name + '.md', 'w', encoding='utf-8') as file:
        file.write("".join(md_parts))
コード例 #3
0
def soup_snapshot_2md(soup):
    """Render a parsed minecraft.net article as Markdown and write it out.

    The Markdown consists of the header image, the title (``#``) and lead
    paragraph (``##``), the article body converted by Tomd, and an author
    footer. The target directory and file name are derived from the title:

    * contains "snapshot"    -> ``./snapshots/<last word of title>``
    * contains "pre-release" -> ``./pre_release/<title after first space>``
    * contains "candidate"   -> ``./candidate/<title after first space>``
    * contains "edition" or "released" -> ``./edition/<full title>``
    * contains "update"      -> ``./edition/<title after first space>``
    * anything else          -> ``./edition/<full title>``

    Args:
        soup: Parsed article page; it must support ``find_all`` with the
            class names used below (presumably a BeautifulSoup object).

    Raises:
        IndexError: If one of the expected page sections is missing.
    """
    # Locate all the nodes we need.
    tags = soup.find_all(
        class_="page-section page-section--first article-body")[0]
    body_tag = soup.find_all("div", class_="end-with-block")[0]
    author_tag = soup.find_all("div", class_="attribution pb-3")[0]

    # Header image.
    img_tag = soup.find_all(class_="article-head__image-container")[0]
    text = f"![](https://www.minecraft.net{img_tag.img['src']})\n"

    # Title and subtitle, e.g. 'Minecraft Snapshot 20w21a'.
    head = tags.h1.get_text(strip=True)
    lead = tags.p.get_text(strip=True)
    text += "# " + head + "\n"
    text += "## " + lead + "\n"

    # Work out the version name and the output directory from the title.
    lowered = head.lower()
    if "snapshot" in lowered:
        head_name = head[head.rfind(" ") + 1:]
        base_dir = "./snapshots/"
    elif "pre-release" in lowered:
        head_name = head[head.find(" ") + 1:]
        base_dir = "./pre_release/"
    elif "candidate" in lowered:
        head_name = head[head.find(" ") + 1:]
        base_dir = "./candidate/"
    elif "edition" in lowered or "released" in lowered:
        # Each keyword needs its own membership test: the previous
        # `"edition" or "released" in ...` was always true.
        head_name = head
        base_dir = "./edition/"
    elif "update" in lowered:
        head_name = head[head.find(" ") + 1:]
        base_dir = "./edition/"
    else:
        # Unrecognised titles keep the location they got while the
        # "edition" test was an accidental catch-all.
        head_name = head
        base_dir = "./edition/"
    dirname = base_dir + head_name
    filename = head_name
    print(dirname)

    # Convert the article body to Markdown, then clean up leftovers that
    # Tomd passes through untouched.
    body_html = "".join(str(child) for child in body_tag.children)
    output = Tomd(body_html).markdown
    output = output.replace("<br/>", "")
    output = output.replace("&lt;", "<")
    output = output.replace("&gt;", ">")
    output = output.replace("<li>", "- ")
    output = output.replace("</li>\n", "")
    text += output

    # Author footer.
    author = author_tag.dl.get_text()
    try:
        author_img_url = f"![](https://www.minecraft.net{author_tag.img['src']})"
    except TypeError:
        # No <img> in the attribution block: author_tag.img is None.
        author_img_url = "![]()"
    # Keep the first 10 characters of the timestamp
    # (presumably the YYYY-MM-DD date part -- TODO confirm).
    pubdate = author_tag.find(class_="pubDate").attrs['data-value'][:10]
    text += (author.rstrip("\n") + "\n" + pubdate + "\n" + author_img_url +
             "\n")
    text = text.replace("Written By", "**Written By**")
    text = text.replace("Published", "**Published**")

    # Create the directory.
    mkdir(dirname)

    # Write the file.
    writedoc(text, dirname, filename)
コード例 #4
0
def build_file(name):
    """Convert ``posts/<name>/index.html`` into ``posts/<name>.md``.

    The output starts with a ``---`` delimited front-matter header (title
    and date) followed by the children of the element marked
    ``itemprop="articleBody"`` converted one by one:

    * ``<img>``       -> ``{% asset_img <file name> %}``
    * ``<h1>``-``<h6>`` -> ATX heading with the matching number of ``#``
    * ``<if...>`` (e.g. iframes) -> copied through as raw HTML
    * ``<figure class="highlight ...">`` -> fenced code block
    * anything else   -> converted by Tomd

    Args:
        name: Post directory name under ``posts/``; also the stem of the
            generated Markdown file.

    Raises:
        AttributeError: If the page has no ``<h1>``, ``<time>`` or
            ``articleBody`` element.
        OSError: If the input cannot be read or the output cannot be written.
    """
    path = "posts/" + name + "/index.html"

    # The explicit encoding is required: without it the read came back empty
    # for the original author. The context manager closes the handle.
    with codecs.open(path, "r", encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, features="html5lib")

    # Front matter. No "tags:" entry is written: the HTML files this variant
    # was written for did not contain the "a.tag-link" element.
    md_parts = [
        "---",
        "\ntitle: " + soup.find("h1").get_text().strip(),
        "\ndate: " + soup.find("time").get_text().replace("-", "/"),
        "\n---\n",
    ]

    content = soup.find(itemprop="articleBody")

    for child in content.children:
        markup = str(child)
        if markup.startswith("<img"):
            # Keep only the file name part of the image URL.
            src = child["src"]
            file_name = src[src.rfind("/") + 1:]
            md_parts.append("\n{% asset_img " + file_name + " %}\n")
        elif (markup.startswith("<h") and len(markup) > 2
                and markup[2] in "123456"):
            # Only real headings: a bare "<h" prefix test would also match
            # <hr>/<header> and make int() raise ValueError.
            num_pounds = "#" * int(markup[2])
            md_parts.append("\n" + num_pounds + " " + child.get_text() + "\n")
        elif markup.startswith("<if"):
            md_parts.append("\n" + markup + "\n")
        elif markup.startswith('<figure class="highlight'):
            # The language is whatever follows 'highlight ' inside the class
            # attribute, up to the closing quote.
            code_type = markup[25:markup.find('"', 24)]

            code_html = Tomd(markup).markdown
            code_html = code_html[code_html.find('<td class="code"'):]
            pre_tag = BeautifulSoup(code_html, features="html5lib").find("pre")

            # Drop the first 5 and last 6 characters, i.e. a bare
            # "<pre>" / "</pre>" wrapper, then turn line breaks into newlines.
            code = str(pre_tag)[5:-6].replace("<br/>", "\n")

            md_parts.append("\n``` " + code_type + "\n" + code + "```")
        else:
            md_parts.append(Tomd(markup).markdown)

    # The explicit encoding avoids a UnicodeEncodeError on platforms whose
    # default encoding cannot represent the post's characters.
    with open("posts/" + name + ".md", "w", encoding="utf-8") as file:
        file.write("".join(md_parts))