Example #1
def parse_challenges(folder, challenges):
    for challenge in challenges:
        name = challenge['name']
        print("[+] Parsing %s" % (name))
        path = '%s/%s' % (folder, name)

        try:
            os.makedirs(path)
        except Exception as e:
            pass
        markdown = ''
        markdown += '#### %s  \n\n' % (name)
        markdown += '%s  \n' % (tomd.Tomd(challenge['detail']).markdown)
        markdown += '#### Hint:  \n\n'
        markdown += '``` \n'
        markdown += '%s  \n' % (challenge['prompt'])
        markdown += '``` \n'
        markdown += '#### Information:  \n\n'
        url = challenge['url']
        if url.startswith("/file/"):
            file_path = '%s/files' % (path)
            try:
                os.makedirs(file_path)
            except Exception as e:
                pass
            download_link = '%s%s' % (website, url)
            download(download_link, file_path)
            filename = url.split("/")[-1]
            markdown += '* File: [%s](files/%s)  \n\n' % (filename, filename)
        else:
            markdown += '* Url: %s  \n\n' % (challenge['url'])
        markdown += '* Solved: %d  \n\n' % (challenge['user_solved'])
        with open('%s/README.md' % path, 'w') as f:
            f.write(markdown)
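# Note: parse_challenges above relies on a module-level `website` base URL and a
# `download(url, dest_dir)` helper that are not shown in this snippet. A minimal
# sketch of such a helper, assuming `requests` is available (the names, URL, and
# behaviour here are assumptions, not the original implementation):
import os
import requests

website = 'https://ctf.example.com'  # assumed base URL of the challenge site

def download(url, dest_dir):
    # Save the file at `url` into `dest_dir`, keeping its original file name.
    filename = url.split('/')[-1]
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(os.path.join(dest_dir, filename), 'wb') as fh:
        for chunk in response.iter_content(chunk_size=8192):
            fh.write(chunk)
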
def main():
    headers = {
        'referer':
        'https://www.jianshu.com/p/c75f1ce0a6ae',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }

    url = 'https://www.jianshu.com/p/c75f1ce0a6ae'

    res = requests.get(url=url, headers=headers)

    # print(res.text)

    soup = BeautifulSoup(res.text, 'html.parser')

    content = soup.select_one('div.show-content-free')
    # print(content)

    m = re.search(r'div\sclass="show-content-free">([\s\S]*)</div>',
                  str(content))
    if m:
        html = m.group(1)
        print(html)
        tomd.Tomd(html=html, file='test.md').export()
def spider_one_csdn(title_url):  # URL of the target article
    html = requests.get(url=title_url, headers=head).text
    page = parsel.Selector(html)
    # Create the selector
    title = page.css(".title-article::text").get()
    title = filter_str(title)
    print(title)
    content = page.css("article").get()
    content = re.sub("<a.*?a>", "", content)
    content = re.sub("<br>", "", content)
    # Strip <a> and <br> tags
    text = tomd.Tomd(content).markdown
    # Convert to a Markdown file
    path = os.getcwd()  # Get the current working directory
    file_name = "passage"
    final_road = os.path.join(path, file_name)
    try:
        os.mkdir(final_road)
        print('Directory created!')
    except OSError:
        # print('Directory already exists or an error occurred')
        pass
    with open(os.path.join(final_road, title + ".md"), mode="w",
              encoding="utf-8") as f:
        f.write("# " + title + "\n")
        f.write(text)
def process_conversion(file_name):

    cwd = os.getcwd()  # Get the current working directory (cwd)
    files = os.listdir(cwd)  # Get all the files in that directory
    print("Files in %r: %s" % (cwd, files))

    output_file_name = file_name.replace("html", "md")
    output_file = open(output_file_name, "w+")
    print("Output File Name             : ", output_file_name)

    with open(file_name, "r") as input_file:
        html_str = input_file.read()  # the converters expect an HTML string, not a file object

        if converter == "html2markdown":
            md_str = html2markdown.convert(html_str)
            output_file.write(md_str)
        elif converter == "markdownify":
            md_str = md(html_str)
            output_file.write(md_str)
        elif converter == "tomd":
            md_str = tomd.Tomd(html_str).markdown
            output_file.write(md_str)
        else:
            print("Not a valid converter")

    output_file.close()
    return input_file, output_file
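# Note: `converter` above is a module-level name that is not defined in this
# snippet. A minimal sketch of how it might be selected (the argparse-based
# selection below is an assumption, not part of the original code):
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert an HTML file to Markdown')
    parser.add_argument('file_name', help='path to the .html input file')
    parser.add_argument('--converter', default='tomd',
                        choices=['html2markdown', 'markdownify', 'tomd'])
    args = parser.parse_args()
    converter = args.converter
    process_conversion(args.file_name)
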
def builduserselection():
    articles_to_build = []
    headers = {"Accept": "text/html"}
    articles_to_parse = request.args.get('selected_articles')
    for each in articles_to_parse:
        url = 'https://www.medium.com/' + articles_to_parse[each]['article_slug']
        r = requests.get(url, headers=headers)
        a = BeautifulSoup(r.text, 'html.parser')
        main = a.main

        for x in main.findAll('link'):
            x.extract()

        for x in main.findAll('hr'):
            x.extract()

        for x in main.findAll('nav'):
            x.extract()

        for x in main.findAll('path'):
            x.extract()
    
        soup_string = str(main)
        tomarkdown = tomd.Tomd(soup_string).markdown
        
        md_dict = {
                "article_id": articles_to_parse[each]['id'],
                "article_title": articles_to_parse[each]['title'],
                "article_md": tomarkdown
                }

        articles_to_build.append(md_dict)

    return articles_to_build
Example #6
def convert(path):
    print('Converting...')

    file_pattern = os.path.join(path, 'posts/*.xml')
    for f in glob.glob(file_pattern):
        data = {'title': '',
                'date': '',
                'draft': '',
                'category': '',
                'tags': '',
                'slug': '',
                'content': ''}
        # print(f)
        # import pdb;pdb.set_trace()
        document = untangle.parse(f)

        dts = document.post.pubDate.cdata

        data['title'] = document.post.title.cdata.replace('"', '')
        data['date'] = dts.replace(' ', 'T')
        data['draft'] = document.post.ispublished.cdata == 'False'
        data['category'] = _parse_categories(path, document.post.categories)
        data['tags'] = _parse_tags(document.post.tags)
        data['slug'] = document.post.slug.cdata
        data['content'] = tomd.Tomd(html.unescape(document.post.content.cdata)).markdown

        dt = datetime.strptime(dts, "%Y-%m-%d %H:%M:%S")
        data['year'] = dt.year
        # data['content'] = mad(html.unescape(document.post.content.cdata))
        with open('content/post/{}.md'.format(data['slug']), 'w+') as md:
            md.writelines(TEMPLATE.format(**data))
def spider_one_csdn(title_url):  # URL of the target article
    head = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.52",
        "Referer": "https://blog.csdn.net/tansty_zh"
    }
    html = requests.get(url=title_url, headers=head).text
    page = parsel.Selector(html)
    # Create the selector
    title = page.css(".title-article::text").get()
    print(title)
    content = page.css("article").get()
    content = re.sub("<a.*?a>", "", content)
    content = re.sub("<br>", "", content)
    # Strip <a> and <br> tags
    text = tomd.Tomd(content).markdown
    # Convert to a Markdown file
    path = os.getcwd()  # Get the current working directory
    file_name = "passage"
    final_road = os.path.join(path, file_name)
    try:
        os.mkdir(final_road)
        print('Directory created!')
    except OSError:
        # print('Directory already exists or an error occurred')
        pass
    with open(os.path.join(final_road, title + ".md"), mode="w",
              encoding="utf-8") as f:
        f.write("# " + title + "\n")
        f.write(text)
 def spider_csdn(self):
     # URL of the target article
     title_url = self.text.toPlainText()
     MessageBox = QMessageBox(self.windows)
     if not title_url:
         MessageBox.critical(self.windows, "Error", "Please enter a URL")
         return
     head = {
         "User-Agent":
         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.52"
     }
     html = requests.get(url=title_url, headers=head).text
     page = parsel.Selector(html)
     # Create the selector
     title = page.css(".title-article::text").get()
     res = re.compile("[^\u4e00-\u9fa5^a-z^A-Z^0-9]")
     restr = ''
     title = res.sub(restr, title)
     content = page.css("article").get()
     content = re.sub("<a.*?a>", "", content)
     content = re.sub("<br>", "", content)
     texts = tomd.Tomd(content).markdown
     # Convert to a Markdown file
     with open(title + ".md", mode="w", encoding="utf-8") as f:
         f.write("# " + title + "\n")
         f.write(texts)
         MessageBox.information(self.windows, "Success", "Article retrieved")
Example #9
def reptile():
    page = 2
    for n in range(1, page):
        url = "https://www.cnblogs.com/yjmyzz/default.html?page=" + str(n)
        headers = {'User-Agent': random.choice(blog_headers)}
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        count = soup.find_all(class_='postTitle')
        for c in count:
            try:
                # Get the blog post URL
                href = c.find('a').attrs['href']
                headers = {'User-Agent': random.choice(blog_headers)}
                res = requests.get(href, headers=headers)
                soup = BeautifulSoup(res.text, 'html.parser')
                # Get the blog post content
                content = soup.find('div', class_='blogpost-body')
                # Strip the outer DIV wrapping the post
                content = content.decode_contents(formatter="html")
                # Get the blog post title
                title = soup.find('a', id='cb_post_title_url').text
                # Convert the post HTML to Markdown
                content = tomd.Tomd(content).markdown
                # Write the post to the database
                write_db(title, content, href)
                print("Inserted: {}".format(href))
            except Exception as e:
                print(e)
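# Note: reptile above relies on a module-level `blog_headers` list and a
# `write_db(title, content, href)` helper that are not shown. A minimal sketch of
# what they might look like (the sqlite3 storage and table layout below are
# assumptions, not the original implementation):
import sqlite3

blog_headers = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
]

def write_db(title, content, href):
    # Store one converted post; assumes a posts(title, content, href) table exists.
    conn = sqlite3.connect('blog.db')
    with conn:
        conn.execute('INSERT INTO posts (title, content, href) VALUES (?, ?, ?)',
                     (title, content, href))
    conn.close()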
Example #10
def get_art(url):

    #url = 'https://cuiqingcai.com/8468.html'
    #time.sleep(1)
    print(url)
    res = requests.get(url, headers=header)
    time.sleep(1)
    html = etree.HTML(res.text)
    art_title = html.xpath('//h1[@class="article-title"]/a/text()')
    #art_content = html.xpath('//article[@class="article-content"]/p/text()')
    #p = re.compile('<article class="article-content">.*</article>')
    #art_content2 = p.search(res.text)
    #art_content2  = re.search('<article class="article-content">.*?</article>',res.text)
    soup = BS(res.text, 'html.parser')
    content = soup.article
    #print(art_title)
    #print(res.text)
    #print(content)
    print('Page downloaded, generating the md file')
    text = tomd.Tomd(str(content)).markdown
    print(art_title)
    #print(text)
    with open('./book/%s.md' % art_title[0], 'w', encoding='utf-8') as f:
        f.write('# %s\n' % art_title[0])
        f.write(text)
    print('%s----markdown file saved' % art_title[0])
Example #11
    def question_crawler(self):
        url = AlgorithmMd.SITE[self.site]['base_url'] + str(self.q_num)
        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')

        title = soup.select('#problem_title')[0].string

        problem_description = soup.select('#problem_description')[0]
        org_image = problem_description.find('img')

        problem_description = tomd.Tomd(str(problem_description)).markdown

        problem_image = None

        if org_image:
            problem_image_src = org_image['src']
            problem_image = self.image_to_md(problem_image_src, title)
            print(org_image)
            print(problem_image)
            problem_description = problem_description.replace(
                str(org_image), problem_image)

        problem_input = soup.select('#problem_input')[0]
        problem_input = tomd.Tomd(str(problem_input)).markdown

        problem_output = soup.select('#problem_output')[0]
        problem_output = tomd.Tomd(str(problem_output)).markdown

        sample_input = soup.find_all(id='sample-input-1')[0].get_text()
        # sample_input = tomd.Tomd(str(sample_input)).markdown

        sample_output = soup.find_all(id='sample-output-1')[0].get_text()
        # sample_output = tomd.Tomd(str(sample_output)).markdown

        data = {
            'title': title,
            'problem_description': problem_description,
            'problem_input': problem_input,
            'problem_output': problem_output,
            'sample_input': sample_input,
            'sample_output': sample_output
        }

        print(data)
        return data
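    # Note: image_to_md above is another method of this class that is not shown
    # here. A minimal sketch of what it might do, assuming it downloads the image
    # and returns Markdown image syntax (the file-name scheme is an assumption):
    def image_to_md(self, image_src, title):
        # image_src is assumed to be an absolute URL to the problem image
        response = requests.get(image_src)
        file_name = '{}.png'.format(title)
        with open(file_name, 'wb') as img:
            img.write(response.content)
        return '![{}]({})'.format(title, file_name)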
Example #12
def test_html_to_wiki():
    # Read the HTML file
    with open(file_path, 'r', encoding='UTF-8') as f:
        htmlpage = f.read()
    # Convert the HTML content to Markdown
    text = tomd.Tomd(htmlpage).markdown
    # Write out the converted content
    with open('test.md', 'w') as f:
        f.write(text)
Example #13
def save_page(url, filename):
    html = open_url(url)
    print("保存成网页:")
    with open(filename + ".html", "w", encoding="utf-8") as f:
        f.write(html)

    print("保存成md:")

    md = tomd.Tomd(html).markdown
    with open(filename + ".md", "w", encoding="utf-8") as f:
        f.write(md)
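# Note: save_page above relies on an `open_url(url)` helper that is not shown.
# A minimal sketch, assuming it simply fetches the page with requests and returns
# the decoded HTML (this is an assumption, not the original implementation):
import requests

def open_url(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    return res.text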
Example #14
    def parse_blog(self,response):
        article=response.xpath('//article/div[@class="asset-content entry-content"]').extract()
        mdTxt=tomd.Tomd(article[0]).markdown
        name=response.xpath("//article/h1/text()").extract()[0]
        cate=response.xpath('//div[@class="entry-categories"]//li/a/text()').extract()[0]

        path=os.path.expandvars('$HOME')+"/Downloads/ruanyifeng/"+cate+"/"
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + name + ".md", "w+") as f:
            f.write(mdTxt)
Example #15
def main():
    while True:

        try:

            for r in praw.models.util.stream_generator(reddit.inbox.mentions,
                                                       skip_existing=True):
                reply = ''

                if isinstance(r, praw.models.Comment):

                    call = r.body
                    post = r.parent()

                    if isinstance(post, praw.models.Comment):
                        body = post.body

                    else:
                        body = '# ' + post.title + '\n' + post.selftext

                    langs = parseCall(call)
                    if isinstance(langs, list):
                        html = mdToHTML(body)
                        originalText = getTextFromHTML(html)
                        originalTextAltered = repConstants(originalText)
                        result = translate(originalTextAltered, langs[0],
                                           langs[1])
                        if isinstance(result, list):
                            html = replaceHTMLWithTranslation(
                                html, originalText, result[0])
                            reply = tomd.Tomd(html).markdown
                            reply = formatTranslation(reply, result[1],
                                                      result[2])
                        elif result == 'error':
                            print("error")
                        else:
                            reply = 'Invalid syntax. ' + result
                    else:
                        reply = langs

                    reply = appendInfo(reply)
                    r.reply(reply)
                    print(reply + '\n')

        except praw.exceptions.APIException as e:
            print("waiting")
            time.sleep(try_get_seconds_to_wait(e))

        except RequestException:
            print("internet error")
Example #16
async def wiki(message):
    await client.send_typing(message.channel)

    search=message.content.split(" ",1)
    link="https://en.wikipedia.org/w/api.php"
    payload = {'action': 'query', 'list': 'search', 'format': 'json', 'srsearch': search[1]}
    page = requests.get(link, headers=headers, timeout=5, params=payload)
    page.encoding = 'UTF-8'
    response = json.loads(page.text)
    result = response['query']['search'][0]

    url = "https://en.wikipedia.org/wiki/" + result['title'].replace(" ", "_")

    msg = "{}: {}... {}".format(result['title'], tomd.Tomd(result['snippet'].split(".")[0]).markdown, url)
    return msg
def process(section_s):
	output_s = '# %s\n\n' % section_s
	note_s_l = []
	for i in section_d[section_s]:
		note_title = '## %s\n' % i[0]
		note_body = line_break.sub('\n', i[1])
		note_body = tomd.Tomd(note_body).markdown
		note_body = formula_inline.sub('$\g<1>$', note_body)
		note_body = formula_block.sub('$$\n\g<1>\n$$', note_body)
		note_body = lt.sub('<', note_body)
		note_body = gt.sub('>', note_body)
		note_body = amp.sub('&', note_body)
		note_body += '\n'
		note_s = note_title + note_body
		output_s += note_s
	return output_s
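# Note: `process` above relies on a `section_d` mapping and several module-level
# regexes (line_break, formula_inline, formula_block, lt, gt, amp) that are not
# shown. A plausible sketch of those definitions (only the names come from the
# snippet; the exact patterns are assumptions):
import re

line_break = re.compile(r'<br\s*/?>')               # HTML line breaks -> newlines
formula_inline = re.compile(r'\\\((.+?)\\\)')       # \( ... \)  ->  $ ... $
formula_block = re.compile(r'\\\[(.+?)\\\]', re.S)  # \[ ... \]  ->  $$ ... $$
lt = re.compile('&lt;')                             # unescape HTML entities
gt = re.compile('&gt;')
amp = re.compile('&amp;')
# section_d is expected to map a section name to a list of (note_title, note_html) pairs.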
    def toMarkdown(self):
        """
        生成markdown
        :return:
        """
        def fixMd(md):
            # Remove duplicate image URLs produced when generating the md
            md_split = md.split('\n')
            fix_md_str = []
            for i in range(len(md_split)):
                if i < len(md_split) - 2 and md_split[i] == md_split[i + 2] and md_split[i] \
                        and '```' not in md_split[i]:
                    continue
                fix_md_str.append(md_split[i])
            md_str_all = '\n'.join(fix_md_str)
            return md_str_all

        if self.body_html:
            md = tomd.Tomd(self.html).markdown
            self.body_md = fixMd(md)
Example #19
    def __init__(self, path, contetClasName):
        self.root = '/Users/amanda/Documents/mweb/mweb/collect-web/'
        self.root = '/Users/amanda/Downloads/mweb'

        self.name = self.file_name(path)[0]
        self.path = self.file_name(path)[1]

        folder_name = time.strftime("%Y%m%d", time.localtime())
        self.dist_folder = self.generatePath("./{0}".format(folder_name))
        self.dist_media = self.generatePath("./{0}/media".format(folder_name))

        str_html = self.read_file(path)

        soup = BeautifulSoup(str_html, 'lxml')  # lxml as the parser
        content = str(soup.select(contetClasName)[0])

        # to markdown
        markdown_content = tomd.Tomd(content).markdown
        print(markdown_content)
        self.cleanHTMLTargert(markdown_content)
def get_content(img_dir, url):

    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    a = soup.find("div", attrs={"class": "col-md-8 description"})
    img_url = soup.find("img", attrs={"id": "imageresource"})
    source_url = soup.find("span", attrs={"class": "description-source"})

    s = ''
    if img_url:
        img_src = img_url['src']
        img_name = img_src.split('/')[-1]
        img_path = '{}/{}'.format(img_dir, img_name)
        if not os.path.exists(img_path):
            with open(img_path, 'wb') as imgf:
                imgf.write(requests.get(BaseUrl + '/' + img_src).content)
        s += '![](./img/{})'.format(img_name) + '\n'
    s += tomd.Tomd(str(a)).markdown + '\n'
    if source_url:
        source_url = source_url.find('a').get('href')
        s += 'source: [source]({})\n'.format(source_url)
    return s
def parse_rss_feed(item):
    print(item)
    json_item = {
        'channel_url': item['url'],
        'channel_uuid': item['uuid'],
        'channel_title': item['title']
    }
    print(item['url'])
    info = feedparser.parse(item['url'])
    data_list = info['entries']
    print(len(data_list))
    if data_list:
        for data in data_list:
            value = data['content'][0]['value']
            value = tomd.Tomd(value).markdown
            json_item['content'] = markdown.markdown(value)
            json_item['url'] = data['link']
            json_item['updated'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 data['updated_parsed'])
            json_item['summary'] = data['summary']
            json_item['title'] = data['title']
            res = requests.post(ARTICLE_URL, json.dumps(json_item))
            print(res.content)
    else:
        data = urllib.request.urlopen(item['url']).read()
        soup = bs4.BeautifulSoup(data, "html.parser")
        ch = soup.findAll('channel')[0]
        a = ch.findAll('item')
        print('len(a)', len(a))
        for i in a:
            json_item['content'] = i('content:encoded')[0].extract().text
            json_item['url'] = i.guid.text
            json_item['updated'] = parse(
                i.pubdate.text, fuzzy=True).strftime("%Y-%m-%d %H:%M:%S")
            json_item['summary'] = i.description.text
            json_item['title'] = i.title.text
            res = requests.post(ARTICLE_URL, json.dumps(json_item))
            print(res.content)
Example #22
def edit_post(slug):
    if not session.get('logged_in'):
        abort(401)
    if request.method == 'POST':
        if request.form['title'] == '' or request.form['title'].isspace():
            flash('Title cannot be empty')
            return redirect(url_for('edit_post', slug=slug))
        if request.form['text'] == '' or request.form['text'].isspace():
            flash('Text cannot be empty')
            return redirect(url_for('edit_post', slug=slug))

        db = get_db()
        post = db.execute('select * from posts where slug == ?',
                          [slug]).fetchall()[0]
        title = request.form['title']
        new_slug = slug
        if title != post['title']:
            new_slug = format_slug(slugify(title))
        text = Markup(markdown.markdown(request.form['text']))
        # set publish status
        published = True if 'published' in request.form else False
        publish_date = str(datetime.now()) if published else "unpublished"
        db.execute(
            '''update posts set title = ?, slug = ?, text = ?,
                      published = ?, publish_date = ?
                      where slug == ?''',
            [title, new_slug, text, published, publish_date, slug])
        db.commit()
        return redirect(url_for('show_posts'))

    db = get_db()
    post = db.execute('select * from posts where slug == ?',
                      [slug]).fetchall()[0]
    # get post from slug
    text = tomd.Tomd(post['text']).markdown
    return render_template('edit.html', post=post, text=text, slug=slug)
Example #23
# -*- coding: utf-8 -*-
import tomd
import requests
import io

headers = {
    'User-Agent':
    'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}

# Put the URL to convert here
html_res = requests.get('https://输入需要转换的url', headers=headers)
html_res.encoding = 'utf-8'

# html = """这里是网页内容"""
html = html_res.text

md = tomd.Tomd(html).markdown

# Write the file in the current directory
with io.open('./Title.md', 'w', encoding='utf-8') as f:
    f.write(md)

print("Task completed")
Example #24
    def desc_to_md(self, desc):
        md_head, file_pname = self.get_md_head_and_file_pname(desc)

        with open(file_pname, "w+", encoding='utf-8') as f:
            f.write(md_head.strip("\n") + tomd.Tomd(desc).markdown)
 def parseToMarkdown(self, param):
     return tomd.Tomd(str(param)).markdown
Example #26
 def html2markdown(cls, html):
     mdTxt = tomd.Tomd(html).markdown
     return mdTxt
Example #27
# However, anything complex raises an error.
#
\n\n\n"""
print(__doc__)

import os
import sys
import tomd as td
import html2text as ht

from pprint import pprint


HOME_NAME = "web_beautifulsoup_scrapping"
DIRS = os.path.dirname(__file__).partition(HOME_NAME)
HOME_DIR = DIRS[0] + DIRS[1] + "\\"

sys.path.append(HOME_DIR)

# FILE_NAME = "01_fruit.html" # N.G
FILE_NAME = "html_to_text_sample.html"    # O.K

FILE_W_DIR = HOME_DIR + "\\_statics\\" + FILE_NAME

with open(FILE_W_DIR, 'r', encoding='utf-8') as f:
    html = f.read()


a = td.Tomd(html).markdown
print(a)
Example #28
# June 27 2017
# Andrew Xia
# main program (for testing stuff)

import sys
sys.path.insert(0, 'tomd/')
import tomd
import re

FOLDER = "/home/andrew/Documents/Evernote_170625/Journal/Ch 08"
FILE = "[over]analysis.html"
CONTENT = ""

with open(FOLDER + "/" + FILE) as f:
    for line in f:
        CONTENT += line

converter = tomd.Tomd(CONTENT, FOLDER, FILE)
converter.export('/home/andrew/Documents/Evernote_170625/Ch08_md')

# x = converter.markdown

# file = open('tmp.txt','w')
# file.write(x)
# file.close()
Example #29
def run():
    html = getHtml()
    #print(html)
    mdTxt = tomd.Tomd(html).markdown
    print('markdown :{}'.format(mdTxt))
Example #30
 print(tittle)
 infos = re.findall(r'<div class="col-md-3 info">(.*?)</div>',
                    str(base_req.content, 'utf-8', errors='ignore'),
                    re.S)
 img_url = re.findall(
     r'<a class="bigImage" href="(.*?)">',
     str(base_req.content, 'utf-8', errors='ignore'))
 sample_url = re.findall(
     r'<a class="sample-box" href="(.*?)">',
     str(base_req.content, 'utf-8', errors='ignore'))
 name = re.findall(r"<span style=\"color:#CC0000;\">(.*?)</span>",
                   str(base_req.content, 'utf-8', errors='ignore'),
                   re.S)
 print(infos)
 print(name)
 info = tomd.Tomd(infos[0]).markdown
 #.replace('\r\n','')
 print(info)
 print(img_url)
 print(sample_url)
 try:
     t = tittle[0]
     tittle[0] = t.replace('\n', '')
     t = tittle[0].replace(' ', '')
     t = re.sub(
         r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+',
         '', t)
     t = t.replace('・', '')
     print(t)
 except IndexError:
     pass