Code example #1
File: main.py Project: wkias/zsxq-crawler
    def parse_article(self, url):
        # Fetch the page and isolate the article container
        html = requests.get(url=url, headers=self.headers).text
        html = bs(html, features="lxml")
        html = html.find("div", class_="content")
        # Convert the fragment to Markdown and drop the newlines
        md = Tomd(str(html)).markdown
        md = md.replace("\n", "")
        return md
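Every snippet on this page reduces to the same core call: construct Tomd from an HTML string and read its markdown property (example #28 below also exercises the module-level tomd.convert shortcut). A minimal, self-contained sketch of just that core — the URL is a placeholder:

import requests
from tomd import Tomd

# Placeholder URL; any endpoint returning HTML works.
html = requests.get("https://example.com", timeout=10).text
md = Tomd(html).markdown  # convert the HTML string to Markdown
print(md)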
Code example #2
    def parse_news(self, response):
        log.msg("Start to parse news " + response.url, level=log.INFO)
        item = SpiderNewsAllItem()
        day = title = _type = keywords = url = article = markdown = ''
        url = response.url
        day = response.meta['day']
        title = response.meta['title']
        _type = response.meta['_type']
        response = response.body
        soup = BeautifulSoup(response)
#        try:
#            items_keywords = soup.find(class_='ar_keywords').find_all('a')
#            for i in range(0, len(items_keywords)):
#                keywords += items_keywords[i].text.strip() + ' '
#        except:
#            log.msg("News " + title + " has no keywords!", level=log.INFO)

        try:
            # Pick the tag that holds the news body, case by case
            if re.search("translate", url):
                # Translated articles can span several content divs
                article = soup.find_all("div", class_="translate-content")
                markdown = "".join(str(tag) for tag in article)
                markdown = Tomd(markdown).markdown
                article = [tag.text.strip() for tag in article]
                article = ''.join(article)
            else:
                # "Gitee recommended": project intro (usually the README.md content)
                if re.match("https://gitee.com", url):
                    article = soup.find("div", class_="file_content markdown-body")  # CSS selector: #git-readme > div > div.file_content.markdown-body
                # "Gitee weekly"
                elif re.match("https://blog.gitee.com", url):
                    article = soup.find("div", class_="entry-content")
                # Other common pages
                elif soup.find("div", class_=["content", "box-aw main"]):
                    article = soup.find("div", class_=["content", "box-aw main"])
                else:
                    article = soup.find("section", class_=["wrap cke_editable cke_editable_themed cke_contents_ltr cke_show_borders clearfix"])

                # Strip the embedded ad block, if any
                if article and article.find("div", class_="ad-wrap") is not None:
                    article.find("div", class_="ad-wrap").extract()

                markdown = Tomd(str(article)).markdown
                article = article.text.strip()  # extract the plain text
        except:
            log.msg("News " + title + " has no article!", level=log.INFO)
        item['title'] = title
        item['day'] = day
        item['_type'] = _type
        item['url'] = url
        item['keywords'] = keywords
        item['article'] = article
        item['site'] = '开源中国'
        item['markdown'] = markdown
        return item
Code example #3
def Combinations(n=1, k=1, tablePrinted=False):
    # chipEvaluator, cSpace, listToSet and TF come from elsewhere in the project (cf. lop.py, example #7)
    var = [1, 0]
    VARS = {}
    numOfVars = n
    numOfPlaces = k

    condition = str(numOfPlaces) + "==(0"
    for f in range(numOfVars):
        VARS["_V" + str(f + 1)] = var
        condition += "+" + "_V" + str(f + 1)
    condition += ")"
    print("""
Number of symbols to allocate:\t\t""" + str(numOfVars) + """

Number of locations to be placed in:\t""" + str(numOfPlaces) + """

ValidationCondition:\t""" + condition + """
			""")

    RES = [{
        "name": "_ValidCombination",
        "value": condition
    }]

    Evaluation = chipEvaluator(VARS, RES)

    from tomd import Tomd
    Table = Tomd(Evaluation).markdown.replace(cSpace, " ")
    ResTable = Table.split("\n")
    num = len(ResTable[3].split()) - 2
    tot = 0
    elements = ResTable[1].replace("|", "").split()[:-1]
    elements = [elem[1:] for elem in elements]
    print("Elements:\t", listToSet(elements))
    print("\nCombinations:\n{")
    for f in ResTable:
        row = f.split()
        if len(row) > num and row[num] == "1":
            o = f.replace("|", "").split()
            print("\t{", end=" ")
            for g in range(len(o) - 1):
                if (TF(o[g]) == 1):
                    print(elements[g], end=" ")
            print("}")
            tot += 1

    print(("\t" * numOfPlaces) + "}\n\nTotal of valid results: ", tot, "\n")

    if (tablePrinted):
        print("\n", Table)
    return 0
Code example #4
File: Problem.py Project: termicoder/termicoder-beta
    def __str__(self):
        ret = self.text
        ret = ret.replace('_', '')
        keymap = {
            r'\le': r"<=",
            r'\cdot': r".",
            r'\ge': r">=",
            r'\lt': r"<",
            r'$': r"_",
            r'\dots': r"..."
        }
        for key, value in keymap.items():
            ret = ret.replace(key, value)

        # temporarily convert to HTML
        mdProcessor = markdown.Markdown()
        myHtmlFragment = str(mdProcessor.convert(ret))
        myHtmlFragment = myHtmlFragment.replace('<code>', r"```")
        myHtmlFragment = myHtmlFragment.replace('</code>', r"```")

        ret = Tomd(myHtmlFragment).markdown

        # calling like this supports all CLI options (check def main in mdv)
        ret = mdv.main(ret)
        return ret
Code example #5
File: tgbot.py Project: Lulzx/til
def post(update, context):
    if update.channel_post:
        if update.channel_post.chat.username == "rememberbox":
            text = update.channel_post.text
            entities = update.channel_post.parse_entities()
            first, *others = text.splitlines()
            length = len(first) + 2
            if first.startswith('👉'):
                path = first[2:]  # remove the trigger emoji
                first = path.split('/')[-1]  # keep only the last path segment
                filename = path.lower().replace(' ', '-') + ".md"
                rest = parse_bbcode("\n".join(others),
                                    length,
                                    entities,
                                    urled=True)
                html = "<p>" + unescape(render_html(rest)) + "</p>"
                context.bot.send_message(chat_id=691609650, text=html)
                rest = Tomd(html).markdown[length + 1:].replace('<br />', '\n')
                content = "# " + first + "\n\n" + rest
                try:
                    commit = repo.create_file(
                        filename,
                        "automated post from telegram channel",
                        content,
                        branch="master")
                    sha = getattr(commit['commit'], 'sha')
                    url = "https://github.com/Lulzx/til/commit/" + sha
                    context.bot.send_message(
                        chat_id=691609650,
                        text="new addition in TIL repository: {}".format(url))
                except:
                    pass  # swallow errors from the GitHub commit call
Code example #6
    def render(self):
        endnotes = ''
        if self.refs:
            self.content = self.refs.process(html.unescape(self.content))
            endnotes = '\n' + self.refs.endnotes

        return Tomd(self.content).markdown + endnotes
Code example #7
File: lop.py Project: metfar/logicOnPython
def main(args):
    html = 0
    var = [1, 0]
    VARS = {
        "_Armed": var,
        "_Door": var,
        "_Glass": var,
        "_Motion": var
    }
    RES = [{
        "name": "_Alarm",
        "title": "Armed ∧ ( Door ∨ Glass ∨ Motion )",
        "value": "AND(_Armed, OR(_Door,_Glass,_Motion ))"
    }]

    Evaluation = chipEvaluator(VARS, RES)

    if not html:
        try:
            from tomd import Tomd
            print(Tomd(Evaluation).markdown.replace(cSpace, " "))
        except ImportError:
            print(
                "MarkDown format requires the Tomd library installed.\nPlease install it by running * pip install tomd *\n"
            )
            exit(1)
    else:
        print(Evaluation)
    return 0
Code example #8
File: xor.py Project: metfar/logicOnPython
def main(args):
    html = 0
    var = [1, 0]
    VARS = {
        "_P": var,
        "_Q": var
    }
    RES = [
        {"name": "(P ∧ Q)", "value": "AND(_P,_Q)"},
        {"name": "(P ⊽ Q)", "value": "NOR(_P,_Q)"},
        {"name": "_OUT_MINE=(P ∧ Q) ⊽ ( P ⊽ Q )", "value": "NOR(AND(_P,_Q),NOR(_P,_Q))"},
        {"name": "(P⊼P)", "value": "NAND(_P,_P)"},
        {"name": "((P⊼P)⊼Q)", "value": "NAND(NAND(_P,_P),_Q)"},
        {"name": "(Q⊼Q)", "value": "NAND(_Q,_Q)"},
        {"name": "((Q⊼Q)⊼P)", "value": "NAND(NAND(_Q,_Q),_P)"},
        {"name": "_OUT_OPT=((P⊼P)⊼Q)⊼((Q⊼Q)⊼P)", "value": "NAND(NAND(NAND(_P,_P),_Q),NAND(NAND(_Q,_Q),_P))"},
        {"name": "_XOR=(P⊻Q)", "value": "XOR(_P,_Q)"}
    ]

    Evaluation = chipEvaluator(VARS, RES)

    if not html:
        try:
            from tomd import Tomd
            print(Tomd(Evaluation).markdown.replace(cSpace, " "))
        except ImportError:
            print("MarkDown format requires the Tomd library installed.\nPlease install it by running * pip install tomd *\n")
            exit(1)
    else:
        print(Evaluation)
    return 0
Code example #9
    def render(self):
        endnotes = ''
        if self.refs:
            self.content = self.refs.process(self.content)
            endnotes = '\n' + self.refs.endnotes

        return self.title + '\n' + self.url + '\n' \
            + Tomd(self.content).markdown + endnotes
Code example #10
File: html2md.py Project: Longxr/python-data-process
def html2md(url):
    # Fetch the HTML content
    html_data = requests.get(url).text

    text = Tomd(html_data).markdown
    # print(text)

    with open("html.md", "w", encoding='utf-8') as text_file:
        text_file.write(text)
Code example #11
def DrupalScrap():
    links = buildlist()
    for item in links:
        get_site = req.Request(item)
        with req.urlopen(get_site) as response:
            site_content = response.read()
            soup = BeautifulSoup(site_content, 'html.parser')
            # Position on the article text box
            for img in soup.find_all('img'):
                img.decompose()
            header_box = soup.find('h1', attrs={'class': 'page-title'})
            article_box = soup.find('div', attrs={'class': 'node__content'})
            article_markdown = Tomd(str(article_box)).markdown
            page_title = Tomd(str(header_box)).markdown
            # permalink_nospace = header_box.replace(" ", "_")
            # permalink = permalink_nospace.lower()
            article = (page_title, article_markdown)
    return article  # note: only the last link's article survives the loop
Code example #12
File: html2md.py Project: Longxr/python-data-process
def html2mdtest():
    html_data = """<pre>ls -a 列出目录所有文件,包含以.开始的隐藏文件
    ls -A 列出除.及..的其它文件
    ls -r 反序排列
    ls -t 以文件修改时间排序
    ls -S 以文件大小排序
    ls -h 以易读大小显示
    ls -l 除了文件名之外,还将文件的权限、所有者、文件大小等信息详细列出来</pre>"""

    text = Tomd(html_data).markdown
    print(text)
Code example #13
    def _scrap(self, year, day, day_dir):
        """Scrap data from AoC website."""
        url = self._aoc_uri.format(year=year, day=day)
        response = self.request(url, cookies={"session": self._token})

        if not response:
            print(f"Exercise for {day}/12/{year} is not available")
            return

        html = response.text

        self._year_titles.append(self._pattern_title.search(html).groups()[0])
        # Why not group(0)? Because group(0) is the entire match, not just
        # the captured title.

        begin = end = 0

        if day == 1:  # we get the introduction text
            # No BS4? Not needed for this trivial parsing
            begin = html.find('</h2>') + len('</h2>')
            end_str = self._pattern_gl.search(html).group(0)
            end = html.find(end_str) + len(end_str)
            self._year_readme_text = Tomd(html[begin:end]).markdown

        # Get the problem wording
        begin = end if end else html.find('</h2>') + len('</h2>')
        end = html.find('</article>') + len('</article>')
        problem_text = Tomd(html[begin:end]).markdown

        with open(f"{day_dir}/README.md", 'w') as readme:
            readme.write(f"# {self._year_titles[-1]}")
            readme.write(problem_text)

        # Get the input
        response = self.request(f"{url}/input",
                                cookies={"session": self._token})
        puzzle_input = response.text

        with open(f"{day_dir}/input", 'w') as input_file:
            input_file.write(puzzle_input)
Code example #14
def htmlToMd(htmlFile, mdFile):
    print(htmlFile)
    with open(htmlFile, 'r', encoding="utf-8") as f:
        contents = f.read()

    #print(contents)
    md = Tomd(contents).markdown
    #print(md)

    with open(mdFile, 'w', encoding="utf-8") as f:
        f.write(md)
Code example #15
def html2md(data, feed_name=None, rooturl=None):
    # data = Tomd(data).markdown
    # data = u'''<!DOCTYPE html><html lang="zh-CN"><head><meta charset="utf-8"></head><body>%s </body></html>''' % data

    soup = BeautifulSoup(data, 'lxml')  #,'html.parser',from_encoding='utf-8'
    head = str(soup.head)
    content = soup.find(id='content')
    if content is None:
        content = soup.body

    clears = [
        'h4',
    ]
    for c in clears:
        All = content.findAll(c)
        for a in All:
            try:
                a.find('a').decompose()
            except Exception as e:
                pass

            try:
                a['class'].clear()
            except Exception as e:
                pass

    dels = ['comments', 'nav-single']
    for tag in dels:
        ts = content.findAll(id=tag)
        for t in ts:
            if t is not None:
                t.decompose()

    imgs = content.findAll('img')
    for img in imgs:
        try:
            parsed = urllib.parse.urlparse(img['src'])
            if parsed.scheme == '':
                img['src'] = rooturl + img['src']
        except:
            pass

    filename = './%s.md' % feed_name
    data = u'''<!DOCTYPE html><html lang="zh-CN">%s<body>%s </body></html>''' % (
        head, str(content))
    data = Tomd(data).markdown
    with open(filename, 'w') as file_writer:
        file_writer.write(data)

    return data
Code example #16
File: FetchWeb.py Project: userwangjf/FetchWeb
    def getAllBody(self,url):

        soup = BeautifulSoup(fetchHtml(url),"html5lib")

        body_class = soup.find_all(class_="article-body")
        body = str(body_class[0])
        #print(str(body).replace('\r\n','--'))

        with open("3.body","w",encoding="utf-8") as f:
            f.write(body.replace("\n","--"))

        with open("3.md","w",encoding="utf-8") as f:
            md = Tomd(body).markdown
            f.write(md)
Code example #17
    def post_entry(self, destination, entry):
        keys = (self.default_settings
                if destination["keys"] == "default" else destination["keys"])
        # find any images from within the summary and add them to the embed
        soup = BeautifulSoup(entry["summary"], "html.parser")
        imgs = soup.find_all("img")
        image = None
        for img in imgs:
            if not image:
                image = img["src"]
            img.decompose()
        entry["summary"] = re.sub(r"<br ?/?>", "\n", str(soup))
        entry["summary"] = re.sub(r"<!--.*-->", "", entry["summary"])
        # remove html formatting from the description
        print(entry["summary"])
        summary = html.unescape(
            Tomd("<p>" +
                 entry["summary"].replace("%22", '"').replace("%3E", ">") +
                 "</p>").markdown.strip())
        entry["summary"] = summary if summary != "" else entry["summary"]
        entry["description"] = entry["summary"]  # alias

        # turns a key's format spec into its value string

        def format_key(t):
            v = []
            # for each word in the value of the key if value exists
            for k in keys[t].split() if t in keys and keys[t] else []:
                if k[0] == "$":
                    v.append(entry[k[1:]] if k[1:] in entry else None)
                else:
                    v.append(k)
            # if we're missing any of the values, display nothing
            return " ".join(v) if not None in v else None

        return self.bot.msg(
            destination["target"],
            format_key("message") + " ",
            embed=self.bot.server.embed(
                title=format_key("title"),
                desc=(format_key("desc") or "")[:2047],  # guard: format_key may return None
                author_name=format_key("author_name"),
                author_link=format_key("author_link"),
                author_icon=format_key("author_icon"),
                footer=format_key("footer"),
                url=format_key("url"),
                color=format_key("color"),
                image=image,
            ),
        )
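Example #17 above runs html.unescape over the Markdown that Tomd returns, which suggests character entities pass through the converter untouched. A minimal sketch of that post-processing step, using a made-up snippet:

import html
from tomd import Tomd

snippet = "<p>Fish &amp; Chips, 3 &gt; 2</p>"
md = Tomd(snippet).markdown       # entities such as &amp; survive conversion
print(html.unescape(md.strip()))  # Fish & Chips, 3 > 2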
Code example #18
def build_file(name):
    path = "posts/" + name + "/index.html"

    with codecs.open(path, 'r') as f:
        html = f.read()

    soup = BeautifulSoup(html, features="html5lib")

    # Save title
    md = "---"
    md += "\ntitle: " + soup.find("h1").get_text().strip()
    md += "\ndate: " + soup.find("time").get_text().replace("-", "/")
    md += "\ntags: " + soup.find("a", "tag-link").get_text()
    md += "\n---\n"

    content = soup.find(itemprop="articleBody")

    for child in content.children:
        if str(child)[:4] == '<img':
            end_point = child['src'].rfind('/') + 1
            file_name = child['src'][end_point:]
            text = "{% asset_img " + file_name + " %}"
            md += "\n" + text + "\n"
        elif str(child)[:2] == '<h':
            num_pounds = "#" * int(str(child)[2])
            md += "\n" + num_pounds + " " + child.get_text() + "\n"
        elif str(child)[:3] == '<if':
            md += "\n" + str(child) + "\n"
        elif str(child)[:24] == '<figure class="highlight':
            code_sample = str(child)

            code_type = code_sample[25:code_sample.find('"', 24)]

            temp_md = Tomd(str(child)).markdown
            temp_md = temp_md[temp_md.find('<td class="code"'):]
            temp_md = BeautifulSoup(temp_md, features="html5lib").find("pre")

            pre_md = str(temp_md)
            pre_md = pre_md[5:-6]

            temp_md = "\n``` "
            temp_md += code_type + '\n'
            for i, char in enumerate(pre_md):
                if pre_md[i:i + 5] == '<br/>':
                    temp_md += '\n'
                    temp_md += char
                else:
                    temp_md += char
            temp_md += '```'

            md += temp_md.replace('<br/>', '')
        else:
            md += Tomd(str(child)).markdown

    with open('posts/' + name + '.md', 'w') as file:
        file.write(md)
Code example #19
File: mdtools.py Project: kksworks/mdtistory
def html_to_markdown_proc_2(html_code_str, asset_folder_path):

    tmpHtmlCode = html_code_str

    target_html_file_path = str(
        PurePath(asset_folder_path).joinpath(TEMP_HTML_FILE_NAME))
    target_markdown_file_path = str(
        PurePath(asset_folder_path).joinpath(TEMP_MD_FILE_NAME))

    with open(target_html_file_path, 'w', encoding='utf-8') as html_file:
        html_file.write(tmpHtmlCode)

    markdown_str = Tomd(html_code_str).markdown

    return markdown_str
Code example #20
def main(lang='ja'):

    alphabet = str(Path('.').absolute().name)
    contest_id = str(Path('.').absolute().parent.name)

    client = AtCoderClient()
    client.login()
    ic(client.check_logging_in())

    contest = Contest(ic(contest_id))
    ic(contest.get_url())
    problem_list = client.download_problem_list(ic(contest))
    problem = problem_list[['A', 'B', 'C', 'D', 'E', 'F'].index(ic(alphabet))]

    html_doc = client.download_problem_content(problem).original_html
    soup = BeautifulSoup(html_doc, 'html.parser')

    title = soup.find(class_="h2").get_text()
    task_statement = soup.find(id="task-statement")

    if lang == 'ja':
        task_statement = task_statement.find(class_='lang-ja')

    def sanitize_html_for_ipynb(html_doc):
        replace_dict = {
            '<var>': '$',
            '</var>': '$',
            '<pre>': '<pre><code>',
            '</pre>': '</code></pre>'
        }
        for old_word, new_word in replace_dict.items():
            html_doc = html_doc.replace(old_word, new_word)
        return ic(html_doc)

    title = str(sanitize_html_for_ipynb(str(title)))
    title = title.lstrip().split('\n')[0]

    task_statement = Tomd(sanitize_html_for_ipynb(str(task_statement)))
    with open('problem.md', 'w+') as f:
        f.write(f"## {ic(title)}\n")
        f.write('---\n')
        f.write(task_statement.markdown)
Code example #21
def getDetailFromLocal(html):

    soup = BeautifulSoup(html, "html.parser")
    content = soup.select_one("#content_views")
    # Remove HTML comments
    for useless_tag in content(text=lambda text: isinstance(text, Comment)):
        useless_tag.extract()
    # Remove useless tags
    tags = ["svg", "ul", ".hljs-button.signin"]
    delete_ele(content, tags)

    # Remove tag attributes
    attrs = ["class", "name", "id", "onclick", "style", "data-token", "rel"]
    delete_ele_attr(content, attrs)

    # Remove empty tags (with these exceptions)
    eles_except = ["img", "br", "hr"]
    delete_blank_ele(content, eles_except)
    # Convert to Markdown
    md = Tomd(str(content)).markdown
    return md
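Examples #11, #15, #17 and #21 share the same preparation step: prune unwanted nodes with BeautifulSoup first, then hand the cleaned fragment to Tomd. A minimal sketch of that pattern, with a made-up fragment and a hypothetical "ad" class name:

from bs4 import BeautifulSoup
from tomd import Tomd

# Hypothetical input; the "ad" class stands in for whatever should be dropped.
html = '<div><p>Keep this paragraph.</p><div class="ad">Drop this ad.</div></div>'
soup = BeautifulSoup(html, "html.parser")
for ad in soup.find_all("div", class_="ad"):
    ad.decompose()  # remove the node in place
md = Tomd(str(soup)).markdown
print(md)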
Code example #22
    def parse_news(self, response):
        log.msg("Start to parse news " + response.url, level=log.INFO)
        item = SpiderNewsAllItem()
        day = title = _type = keywords = url = article = markdown = ''
        url = response.url
        day = response.meta['day']
        title = response.meta['title']
        _type = response.meta['_type']
        response = response.body
        soup = BeautifulSoup(response)
        #        try:
        #            items_keywords = soup.find(class_='ar_keywords').find_all('a')
        #            for i in range(0, len(items_keywords)):
        #                keywords += items_keywords[i].text.strip() + ' '
        #        except:
        #            log.msg("News " + title + " has no keywords!", level=log.INFO)

        try:
            # Walk backwards from the "clear" div to collect the body tags in order
            content_paragraph = soup.find("div", class_="text_info")
            article = []
            for tag in content_paragraph.find(
                    "div", class_="clear").previous_siblings:
                article.insert(0, tag)

            # Join the tags themselves (not the list repr); decode is Python 2 style
            markdown = Tomd(''.join(
                str(tag) for tag in article)).markdown.decode("unicode-escape")
            article = BeautifulSoup(''.join([str(tag) for tag in article
                                             ])).get_text().strip()
        except:
            log.msg("News " + title + " has no article!", level=log.INFO)
        item['title'] = title
        item['day'] = day
        item['_type'] = _type
        item['url'] = url
        item['keywords'] = keywords
        item['article'] = article
        item['site'] = 'InfoQ'
        item['markdown'] = markdown
        return item
Code example #23
File: scraper.py Project: glenlancer/ebook-dl
    def _get_book_info_from_bs(book_bs):
        bookInfo = BookInfo()
        content_bs = book_bs.find('section', {'class': 'content'})
        if not content_bs:
            return None
        title_bs = content_bs.find('h3', {'class': 'product-title'})
        if title_bs:
            bookInfo.title = title_bs.get_text()
        if not bookInfo.title:
            return None
        details_bs = content_bs.find('div', {'class': 'details'})
        if details_bs:
            details_list_bs = details_bs.find('ul', {'class': 'list-unstyled'})
            if details_list_bs:
                bookInfo.details = details_list_bs.get_text().strip('\n')
        body_bs_list = content_bs.find_all('div', {'class': 'body'})
        if len(body_bs_list) == 6:
            # The description sits in the fourth of the six "body" divs
            description_bs = body_bs_list[3]
            bookInfo.description = Tomd(str(description_bs)).markdown.strip()
        download_bs = content_bs.find('span', {'class': 'tn-download'})
        if download_bs:
            bookInfo.tn_url.append(download_bs.get('tn-url'))
        return bookInfo
Code example #24
def getArticle(url):
    text = url2str(url)
    hasUrl = len(osCmd('cat history.log | grep ' + url))
    if hasUrl == 0:
        os.system('echo ' + url + ' >> history.log')
        try:
            # Extract the article content
            doc = pq(text)
            articleTitle = doc('.mainnews_l h1').text()
            articleDetails = doc('.mainnews_l .newsdetails')
            articleDate = getDate(doc('.mainnews_l .time').text())
            articleTime = str(int(time.time()))
            mdArticle = Tomd(str(articleDetails)\
                    .replace('&#13;','')\
                    .replace('\t','')\
                    ).markdown
            html = markdown.markdown(mdArticle)
            articleKey = getKeywords(articleTitle)
            if len(html) > 100:
                # Submit the article to the system
                curlCommand = "curl -s 'http://www.56.com/admin/index.php?controller=module&project=article&operate=save' -H 'Cookie: PHPSESSID=hm8kb0pg9l2cc9vgrve7als2r1' --data 'title={title}&classid=5&tags={key}&hits=0&info=&content={content}&time={time}&status=1&html=&template=%2Farticle_content.html&date={date}' --compressed".format(
                    title=articleTitle,
                    content=html,
                    time=articleTime,
                    date=articleDate,
                    key=articleKey)
                res = json.loads(osCmd(curlCommand)[0])['status']
                if res == 'y':
                    print('Article added successfully: ' + url)
                else:
                    print('Failed to add the article: ' + url)
            else:
                print('Article content too short')
        except:
            print('Error while scraping the article: ' + url)
    else:
        print('Duplicate article: ' + url)
Code example #25
File: blog.py Project: polarisary/hexoblog
def blog(addr):
    r = requests.get(blog_domain+addr, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    blog_title = soup.title.string
    file_name = blog_title.replace('  ', '-')
    file_name = file_name.replace(' ', '-')
    file_name = file_name.replace('.', '-')
    file_name = file_name.replace('/', '-')
    file_name = file_name.replace('+', '-')
    file_name = file_name.replace('_', '-')
    file_name = file_name.replace('(', '')
    file_name = file_name.replace(')', '')
    print(addr)
    # hexo new
    os.system('hexo new "{}"'.format(file_name))
    time.sleep(0.5)
    blog_title_arr = blog_title.split('·')
    blog_header = '''
---
title: {}
date: 2018-09-28 15:47:45
categories: [alidb-monthly, {}, {}]
tags: [{}, {}]
---
    '''.format(blog_title, blog_title_arr[0].strip(), blog_title_arr[1].strip(), blog_title_arr[0].strip(), blog_title_arr[1].strip())
    blog_footer = '''

## 郑重声明
> 文章来自淘宝技术博客 [数据库内核月报](http://mysql.taobao.org/monthly/2017/04/01/)
> 
> 本人为了学习方便而分类整理

    '''
    # print (soup.select('.post')[0].prettify())
    # print (blog_header + Tomd(str(soup.select('.post')[0])).markdown)
    write_file(file_name + '.md', blog_header + Tomd(str(soup.select('.post')[0])).markdown + blog_footer)
Code example #26
    def get_question_detail(self, titleSlug):
        url = "https://leetcode-cn.com/graphql"
        headers = {
            "x-csrftoken": x_csrftoken,
            "referer": "https://leetcode-cn.com",
            "cookie": cookie,
            "Content-Type": "application/json"
        }
        payload = {
            "operationName": "getQuestionDetail",
            "variables": {"titleSlug": titleSlug},
            "query": query
        }

        try:
            r = requests.post(url, data=json.dumps(
                payload), headers=headers, timeout=5)
            r = json.loads(r.text)['data']['question']['content']
            md = Tomd(r).markdown
            return md
        except Exception as e:
            logging.error(e)
            time.sleep(1)
            # retry after a pause (note: the recursion has no retry limit)
            return self.get_question_detail(titleSlug)
Code example #27
def getOneArticle(article_url):
    '''
    article_url: URL of the blog post.
    Extract the data and save it as a Markdown file.
    '''
    try:
        r = requests.get(article_url)  # send the request
        r.raise_for_status()  # raise if the HTTP status signals an error
        # Build an XPath selector object
        selectors = Selector(r.text)
        # Use XPath to pick out the post's title and content
        title = selectors.xpath('//h1[@class="title-article"]/text()').get()
        content = selectors.xpath('//article').get()
        # Convert the content to Markdown
        text = Tomd(content).markdown
        # Strip unwanted leftover anchors
        text = re.sub('<a.*?a>', '', text)
        print(title)
        # Save the file
        with open(title + '.md', 'w', encoding='utf-8') as f:
            f.write('# ' + title + '\n')
            f.write(text)
    except:
        return "Failed to fetch the page"
Code example #28
File: test_tomd.py Project: wiidi/tomd
from tomd import Tomd
import tomd

string = """<p>
<b>bold</b>
<i>italic</i>
<b><i>bold italic</i></b>
<em>em</em>
<strong>strong</strong>
</p>

<hr/>

<table>
<thead>
<tr class="1">
<th>th1</th>
<th>th2</th>
</tr>
</thead>
<tbody>
<tr>
<td>td</td>
<td>td</td>
</tr>
<tr>
<td>td</td>
<td>td</td>
</tr></tbody></table>

"""

print(Tomd(string).markdown)
print(tomd.convert(string))
Code example #29
File: tables.py Project: MagedSaeed/Python-Turorials
from tomd import Tomd

html = """<table>
	<tbody>
		<tr>
			<td>erf(x)</td>
			<td>Returns the error function at x</td>
		</tr>
		<tr>
			<td>erfc(x)</td>
			<td>Returns the complementary error function at x</td>
		</tr>
		<tr>
			<td>gamma(x)</td>
			<td>Returns the Gamma function at x</td>
		</tr>
		<tr>
			<td>lgamma(x)</td>
			<td>Returns the natural logarithm of the absolute value of the Gamma function at x</td>
		</tr>
		<tr>
			<td>pi</td>
			<td>Mathematical constant, the ratio of circumference of a circle to it&#39;s diameter (3.14159...)</td>
		</tr>
		<tr>
			<td>e</td>
			<td>mathematical constant e (2.71828...)</td>
		</tr>
	</tbody>
</table>
"""


print(Tomd(html).markdown)
Code example #30
File: scrap_yoga.py Project: DavidK1ng/randomsite
def get_single_page(inputurl, en):
    page = req.get(inputurl, headers=head).text
    bs_ = bs(page, "lxml")

    img_url = bs_.find(class_="entry-thumbnail")

    if img_url.img:
        img_url = img_url.img['src']

        # Retry the image download up to five times
        for i in range(5):
            try:
                img = req.get(img_url, headers=head).content
                break
            except:
                print('Get Image Time Out')
                time.sleep(10)

        img_path = img_folder + img_url.split('/')[-1]
        with open('app/' + img_path, "wb") as f:
            f.write(img)
    else:
        img_path = ""

    title = bs_.find(class_="entry-title").string
    date = parse(bs_.find(class_="entry-date")['datetime'])
    body = str(bs_.find(class_="entry-content"))
    md = Tomd(body).markdown

    category = bs_.find(class_="entry-categories").a.string
    print(category)
    if not Category.query.filter_by(name=category).first():
        c = Category(name=category)
        db.session.add(c)
        db.session.commit()
        c = Category.query.filter_by(name=category).first()
        print(c)
    else:
        c = Category.query.filter_by(name=category).first()

    u = User.query.get(1)

    print(f'is_en value is: {en}')

    if Post.query.filter_by(title=title).first():
        p = Post.query.filter_by(title=title).first()
        p.title = title
        p.body = md
        p.timestamp = date
        p.cover = img_path
        p.category = c
        p.is_en = en
    else:
        p = Post(title=title,
                 body=md,
                 timestamp=date,
                 author=u,
                 cover=img_path,
                 category=c,
                 is_en=en)

    db.session.add(p)

    db.session.commit()

    return bs_