def parse_challenges(folder, challenges):
    for challenge in challenges:
        name = challenge['name']
        print("[+] Parsing %s" % (name))
        path = '%s/%s' % (folder, name)
        try:
            os.makedirs(path)
        except Exception:
            pass
        markdown = ''
        markdown += '#### %s \n\n' % (name)
        markdown += '%s \n' % (tomd.Tomd(challenge['detail']).markdown)
        markdown += '#### Hint: \n\n'
        markdown += '``` \n'
        markdown += '%s \n' % (challenge['prompt'])
        markdown += '``` \n'
        markdown += '#### Information: \n\n'
        url = challenge['url']
        if url.startswith("/file/"):
            # The challenge ships an attachment: download it next to the write-up
            file_path = '%s/files' % (path)
            try:
                os.makedirs(file_path)
            except Exception:
                pass
            download_link = '%s%s' % (website, url)
            download(download_link, file_path)
            filename = url.split("/")[-1]
            markdown += '* File: [%s](files/%s) \n\n' % (filename, filename)
        else:
            markdown += '* Url: %s \n\n' % (challenge['url'])
        markdown += '* Solved: %d \n\n' % (challenge['user_solved'])
        with open('%s/README.md' % path, 'w') as f:
            f.write(markdown)
def main():
    headers = {
        'referer': 'https://www.jianshu.com/p/c75f1ce0a6ae',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    url = 'https://www.jianshu.com/p/c75f1ce0a6ae'
    res = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    content = soup.select_one('div.show-content-free')
    # Pull the inner HTML of the article container before converting
    m = re.search(r'div\sclass="show-content-free">([\s\S]*)</div>', str(content))
    if m:
        html = m.group(1)
        print(html)
        tomd.Tomd(html=html, file='test.md').export()
def spider_one_csdn(title_url):
    # URL of the target article
    html = requests.get(url=title_url, headers=head).text
    page = parsel.Selector(html)  # build the selector
    title = page.css(".title-article::text").get()
    title = filter_str(title)
    print(title)
    content = page.css("article").get()
    # Strip <a> and <br> tags
    content = re.sub("<a.*?a>", "", content)
    content = re.sub("<br>", "", content)
    text = tomd.Tomd(content).markdown  # convert to Markdown
    path = os.getcwd()  # current working directory
    final_road = os.path.join(path, "passage")
    try:
        os.mkdir(final_road)
        print('Directory created!')
    except OSError:
        # Directory already exists
        pass
    with open(os.path.join(final_road, title + ".md"), mode="w", encoding="utf-8") as f:
        f.write("# " + title + "\n")
        f.write(text)
def process_conversion(file_name):
    cwd = os.getcwd()  # current working directory
    files = os.listdir(cwd)  # all the files in that directory
    print("Files in %r: %s" % (cwd, files))
    output_file_name = file_name.replace("html", "md")
    output_file = open(output_file_name, "w+")
    print("Output File Name : ", output_file_name)
    with open(file_name, "r") as input_file:
        # All three converters expect a string, not a file object
        html_str = input_file.read()
        if converter == "html2markdown":
            md_str = html2markdown.convert(html_str)
            output_file.write(md_str)
        elif converter == "markdownify":
            md_str = md(html_str)
            output_file.write(md_str)
        elif converter == "tomd":
            md_str = tomd.Tomd(html_str).markdown
            output_file.write(md_str)
        else:
            print("Not a valid converter")
    return input_file, output_file
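# A minimal driver sketch for process_conversion above -- assumes `converter`
# is the module-level switch the function reads; the input file name is
# hypothetical.
converter = "tomd"
in_f, out_f = process_conversion("article.html")  # writes article.md alongside it
out_f.close()  # the output handle is returned still open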
def builduserselection():
    articles_to_build = []
    headers = {"Accept": "text/html"}
    articles_to_parse = request.args.get('selected_articles')
    for each in articles_to_parse:
        url = 'https://www.medium.com/' + articles_to_parse[each]['article_slug']
        r = requests.get(url, headers=headers)  # requests, not Flask's request
        a = BeautifulSoup(r.text, 'html.parser')
        main = a.main
        # Strip elements that have no Markdown equivalent
        for tag in ('link', 'hr', 'nav', 'path'):
            for x in main.findAll(tag):
                x.extract()
        soup_string = str(main)
        tomarkdown = tomd.Tomd(soup_string).markdown
        md_dict = {
            "article_id": articles_to_parse[each]['id'],
            "article_title": articles_to_parse[each]['title'],
            "article_md": tomarkdown
        }
        # Collect the assembled article record
        articles_to_build.append(md_dict)
def convert(path):
    print('Converting...')
    file_pattern = os.path.join(path, 'posts/*.xml')
    for f in glob.glob(file_pattern):
        data = {'title': '', 'date': '', 'draft': '', 'category': '',
                'tags': '', 'slug': '', 'content': ''}
        document = untangle.parse(f)
        dts = document.post.pubDate.cdata
        data['title'] = document.post.title.cdata.replace('"', '')
        data['date'] = dts.replace(' ', 'T')
        data['draft'] = document.post.ispublished.cdata == 'False'
        data['category'] = _parse_categories(path, document.post.categories)
        data['tags'] = _parse_tags(document.post.tags)
        data['slug'] = document.post.slug.cdata
        # Unescape the stored HTML before handing it to tomd
        data['content'] = tomd.Tomd(html.unescape(document.post.content.cdata)).markdown
        dt = datetime.strptime(dts, "%Y-%m-%d %H:%M:%S")
        data['year'] = dt.year
        with open('content/post/{}.md'.format(data['slug']), 'w+') as md:
            md.writelines(TEMPLATE.format(**data))
def spider_one_csdn(title_url):
    # URL of the target article
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.52",
        "Referer": "https://blog.csdn.net/tansty_zh"
    }
    html = requests.get(url=title_url, headers=head).text
    page = parsel.Selector(html)  # build the selector
    title = page.css(".title-article::text").get()
    print(title)
    content = page.css("article").get()
    # Strip <a> and <br> tags
    content = re.sub("<a.*?a>", "", content)
    content = re.sub("<br>", "", content)
    text = tomd.Tomd(content).markdown  # convert to Markdown
    path = os.getcwd()  # current working directory
    final_road = os.path.join(path, "passage")
    try:
        os.mkdir(final_road)
        print('Directory created!')
    except OSError:
        # Directory already exists
        pass
    with open(os.path.join(final_road, title + ".md"), mode="w", encoding="utf-8") as f:
        f.write("# " + title + "\n")
        f.write(text)
def spider_csdn(self):
    # URL of the target article
    title_url = self.text.toPlainText()
    MessageBox = QMessageBox(self.windows)
    if not title_url:
        MessageBox.critical(self.windows, "Error", "Please enter a URL")
        return
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.52"
    }
    html = requests.get(url=title_url, headers=head).text
    page = parsel.Selector(html)  # build the selector
    title = page.css(".title-article::text").get()
    # Keep only CJK characters, letters and digits so the title is a safe file name
    res = re.compile("[^\u4e00-\u9fa5^a-z^A-Z^0-9]")
    title = res.sub('', title)
    content = page.css("article").get()
    # Strip <a> and <br> tags
    content = re.sub("<a.*?a>", "", content)
    content = re.sub("<br>", "", content)
    texts = tomd.Tomd(content).markdown  # convert to Markdown
    with open(title + ".md", mode="w", encoding="utf-8") as f:
        f.write("# " + title + "\n")
        f.write(texts)
    MessageBox.information(self.windows, "Done", "Article fetched and converted")
def reptile():
    page = 2
    for n in range(1, page):
        url = "https://www.cnblogs.com/yjmyzz/default.html?page=" + str(n)
        headers = {'User-Agent': random.choice(blog_headers)}
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        count = soup.find_all(class_='postTitle')
        for c in count:
            try:
                # Get the post URL
                href = c.find('a').attrs['href']
                headers = {'User-Agent': random.choice(blog_headers)}
                res = requests.get(href, headers=headers)
                soup = BeautifulSoup(res.text, 'html.parser')
                # Get the post body
                content = soup.find('div', class_='blogpost-body')
                # Drop the outer DIV wrapping the post
                content = content.decode_contents(formatter="html")
                # Get the post title
                title = soup.find('a', id='cb_post_title_url').text
                # Convert the post HTML to Markdown
                content = tomd.Tomd(content).markdown
                # Write the post to the database
                write_db(title, content, href)
                print("Inserted: {}".format(href))
            except Exception as e:
                print(e)
def get_art(url):
    print(url)
    res = requests.get(url, headers=header)
    time.sleep(1)
    html = etree.HTML(res.text)
    art_title = html.xpath('//h1[@class="article-title"]/a/text()')
    soup = BS(res.text, 'html.parser')
    content = soup.article
    print('Page downloaded, generating the Markdown file')
    text = tomd.Tomd(str(content)).markdown
    print(art_title)
    # xpath() returns a list, so index the first match for the file name
    with open('./book/%s.md' % art_title[0], 'w', encoding='utf-8') as f:
        f.write('# %s\n' % art_title[0])
        f.write(text)
    print('%s ---- Markdown file saved' % art_title[0])
def question_crawler(self):
    url = AlgorithmMd.SITE[self.site]['base_url'] + str(self.q_num)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.select('#problem_title')[0].string
    problem_description = soup.select('#problem_description')[0]
    org_image = problem_description.find('img')
    problem_description = tomd.Tomd(str(problem_description)).markdown
    problem_image = None
    if org_image:
        # Re-host the image and swap the original tag for a Markdown link
        problem_image_src = org_image['src']
        problem_image = self.image_to_md(problem_image_src, title)
        problem_description = problem_description.replace(str(org_image), problem_image)
    problem_input = tomd.Tomd(str(soup.select('#problem_input')[0])).markdown
    problem_output = tomd.Tomd(str(soup.select('#problem_output')[0])).markdown
    # Sample I/O is taken as plain text, so no Markdown conversion is needed
    sample_input = soup.find_all(id='sample-input-1')[0].get_text()
    sample_output = soup.find_all(id='sample-output-1')[0].get_text()
    data = {
        'title': title,
        'problem_description': problem_description,
        'problem_input': problem_input,
        'problem_output': problem_output,
        'sample_input': sample_input,
        'sample_output': sample_output
    }
    return data
def test_html_to_wiki():
    # Read the HTML source file
    with open(file_path, 'r', encoding='UTF-8') as f:
        htmlpage = f.read()
    # Convert the HTML content to Markdown
    text = tomd.Tomd(htmlpage).markdown
    # Write out the converted content
    with open('test.md', 'w') as f:
        f.write(text)
def save_page(url, filename):
    html = open_url(url)
    print("Saving as HTML:")
    with open(filename + ".html", "w", encoding="utf-8") as f:
        f.write(html)
    print("Saving as Markdown:")
    md = tomd.Tomd(html).markdown
    with open(filename + ".md", "w", encoding="utf-8") as f:
        f.write(md)
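# A sketch of an open_url helper that would satisfy save_page above -- the
# original is not shown, so this assumes a plain requests fetch.
import requests

def open_url(url):
    res = requests.get(url, timeout=10)
    res.raise_for_status()
    res.encoding = res.apparent_encoding  # guard against mis-declared charsets
    return res.text

save_page("https://example.com/post", "post")  # writes post.html and post.md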
def parse_blog(self, response):
    article = response.xpath('//article/div[@class="asset-content entry-content"]').extract()
    mdTxt = tomd.Tomd(article[0]).markdown
    name = response.xpath("//article/h1/text()").extract()[0]
    cate = response.xpath('//div[@class="entry-categories"]//li/a/text()').extract()[0]
    path = os.path.expandvars('$HOME') + "/Downloads/ruanyifeng/" + cate + "/"
    if not os.path.exists(path):
        os.makedirs(path)  # create intermediate directories as well
    with open(path + name + ".md", "w+") as f:
        f.write(mdTxt)
def main():
    while True:
        try:
            for r in praw.models.util.stream_generator(reddit.inbox.mentions, skip_existing=True):
                reply = ''
                if isinstance(r, praw.models.Comment):
                    call = r.body
                    post = r.parent()
                    if isinstance(post, praw.models.Comment):
                        body = post.body
                    else:
                        body = '# ' + post.title + '\n' + post.selftext
                    langs = parseCall(call)
                    if isinstance(langs, list):
                        html = mdToHTML(body)
                        originalText = getTextFromHTML(html)
                        originalTextAltered = repConstants(originalText)
                        result = translate(originalTextAltered, langs[0], langs[1])
                        if isinstance(result, list):
                            html = replaceHTMLWithTranslation(html, originalText, result[0])
                            reply = tomd.Tomd(html).markdown
                            reply = formatTranslation(reply, result[1], result[2])
                        elif result == 'error':
                            print("error")
                        else:
                            reply = 'Invalid syntax. ' + result
                    else:
                        reply = langs
                    reply = appendInfo(reply)
                    r.reply(reply)
                    print(reply + '\n')
        except praw.exceptions.APIException as e:
            print("waiting")
            time.sleep(try_get_seconds_to_wait(e))
        except RequestException:
            print("internet error")
async def wiki(message):
    await client.send_typing(message.channel)
    search = message.content.split(" ", 1)
    link = "https://en.wikipedia.org/w/api.php"
    payload = {'action': 'query', 'list': 'search', 'format': 'json', 'srsearch': search[1]}
    page = requests.get(link, headers=headers, timeout=5, params=payload)
    page.encoding = 'UTF-8'
    response = json.loads(page.text)
    result = response['query']['search'][0]
    url = "https://en.wikipedia.org/wiki/" + result['title'].replace(" ", "_")
    msg = "{}: {}... {}".format(result['title'],
                                tomd.Tomd(result['snippet'].split(".")[0]).markdown,
                                url)
    return msg
def process(section_s):
    output_s = '# %s\n\n' % section_s
    for i in section_d[section_s]:
        note_title = '## %s\n' % i[0]
        note_body = line_break.sub('\n', i[1])
        note_body = tomd.Tomd(note_body).markdown
        # Re-wrap formulas in TeX delimiters and undo HTML entity escaping
        note_body = formula_inline.sub(r'$\g<1>$', note_body)
        note_body = formula_block.sub('$$\n\\g<1>\n$$', note_body)
        note_body = lt.sub('<', note_body)
        note_body = gt.sub('>', note_body)
        note_body = amp.sub('&', note_body)
        note_body += '\n'
        output_s += note_title + note_body
    return output_s
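# process() relies on module-level compiled patterns that are not shown.
# A plausible sketch of them -- names from the snippet, bodies assumed:
import re

line_break = re.compile(r'<br\s*/?>')               # HTML line breaks -> '\n'
formula_inline = re.compile(r'\\\((.*?)\\\)')        # \( x \)  -> $x$
formula_block = re.compile(r'\\\[(.*?)\\\]', re.S)   # \[ x \]  -> $$ x $$
lt = re.compile(r'&lt;')                             # entity -> '<'
gt = re.compile(r'&gt;')                             # entity -> '>'
amp = re.compile(r'&amp;')                           # entity -> '&'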
def toMarkdown(self):
    """Generate the Markdown version of the page."""
    def fixMd(md):
        # Drop duplicated image-address lines produced during conversion:
        # a non-empty line is skipped when the same line repeats two lines later
        md_split = md.split('\n')
        fix_md_str = []
        for i in range(len(md_split)):
            if i < len(md_split) - 2 and md_split[i] == md_split[i + 2] and md_split[i] \
                    and '```' not in md_split[i]:
                continue
            fix_md_str.append(md_split[i])
        return '\n'.join(fix_md_str)

    if self.body_html:
        md = tomd.Tomd(self.html).markdown
        self.body_md = fixMd(md)
def __init__(self, path, contentClassName):
    self.root = '/Users/amanda/Downloads/mweb'
    self.name = self.file_name(path)[0]
    self.path = self.file_name(path)[1]
    folder_name = time.strftime("%Y%m%d", time.localtime())
    self.dist_folder = self.generatePath("./{0}".format(folder_name))
    self.dist_media = self.generatePath("./{0}/media".format(folder_name))
    str_html = self.read_file(path)
    soup = BeautifulSoup(str_html, 'lxml')  # lxml as the parser
    content = str(soup.select(contentClassName)[0])
    # Convert to Markdown
    markdown_content = tomd.Tomd(content).markdown
    print(markdown_content)
    self.cleanHTMLTargert(markdown_content)
def get_content(img_dir, url):
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    a = soup.find("div", attrs={"class": "col-md-8 description"})
    img_url = soup.find("img", attrs={"id": "imageresource"})
    source_url = soup.find("span", attrs={"class": "description-source"})
    s = ''
    if img_url:
        img_src = img_url['src']
        img_name = img_src.split('/')[-1]
        img_path = '{}/{}'.format(img_dir, img_name)
        if not os.path.exists(img_path):
            with open(img_path, 'wb') as imgf:
                imgf.write(requests.get(BaseUrl + '/' + img_src).content)
        s += '![](./img/{})'.format(img_name) + '\n'
    s += tomd.Tomd(str(a)).markdown + '\n'
    if source_url:
        source_url = source_url.find('a').get('href')
        s += 'source: [source]({})\n'.format(source_url)
    return s
def parse_rss_feed(item):
    print(item)
    json_item = {
        'channel_url': item['url'],
        'channel_uuid': item['uuid'],
        'channel_title': item['title']
    }
    print(item['url'])
    info = feedparser.parse(item['url'])
    data_list = info['entries']
    print(len(data_list))
    if data_list:
        for data in data_list:
            # Round-trip HTML -> Markdown -> HTML to normalize the entry body
            value = data['content'][0]['value']
            value = tomd.Tomd(value).markdown
            json_item['content'] = markdown.markdown(value)
            json_item['url'] = data['link']
            json_item['updated'] = time.strftime('%Y-%m-%d %H:%M:%S', data['updated_parsed'])
            json_item['summary'] = data['summary']
            json_item['title'] = data['title']
            res = requests.post(ARTICLE_URL, json.dumps(json_item))
            print(res.content)
    else:
        # Fall back to parsing the raw feed XML directly
        data = urllib.request.urlopen(item['url']).read()
        soup = bs4.BeautifulSoup(data, "html.parser")
        ch = soup.findAll('channel')[0]
        a = ch.findAll('item')
        print('len(a)', len(a))
        for i in a:
            json_item['content'] = i('content:encoded')[0].extract().text
            json_item['url'] = i.guid.text
            json_item['updated'] = parse(i.pubdate.text, fuzzy=True).strftime("%Y-%m-%d %H:%M:%S")
            json_item['summary'] = i.description.text
            json_item['title'] = i.title.text
            res = requests.post(ARTICLE_URL, json.dumps(json_item))
            print(res.content)
def edit_post(slug):
    if not session.get('logged_in'):
        abort(401)
    if request.method == 'POST':
        # Use equality, not identity, when validating form fields
        if request.form['title'] == '' or request.form['title'].isspace():
            flash('Title cannot be empty')
            return redirect(url_for('edit_post', slug=slug))
        if request.form['text'] == '' or request.form['text'].isspace():
            flash('Text cannot be empty')
            return redirect(url_for('edit_post', slug=slug))
        db = get_db()
        post = db.execute('select * from posts where slug == ?', [slug]).fetchall()[0]
        title = request.form['title']
        new_slug = slug
        if title != post['title']:
            new_slug = format_slug(slugify(title))
        text = Markup(markdown.markdown(request.form['text']))
        # Set publish status
        published = 'published' in request.form
        publish_date = str(datetime.now()) if published else "unpublished"
        db.execute(
            '''update posts set title = ?, slug = ?, text = ?, published = ?, publish_date = ?
               where slug == ?''',
            [title, new_slug, text, published, publish_date, slug])
        db.commit()
        return redirect(url_for('show_posts'))
    db = get_db()
    post = db.execute('select * from posts where slug == ?', [slug]).fetchall()[0]
    # Posts are stored as HTML; convert back to Markdown for the edit form
    text = tomd.Tomd(post['text']).markdown
    return render_template('edit.html', post=post, text=text, slug=slug)
# -*- coding: utf-8 -*-
import tomd
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
# Fill in the URL to convert here
html_res = requests.get('https://输入需要转换的url', headers=headers)
html_res.encoding = 'utf-8'
# html = """page content here"""
html = html_res.text
md = tomd.Tomd(html).markdown
# Write the result to a file in the current directory
with open('./Title.md', 'w', encoding='utf-8') as f:
    f.write(md)
print("Done")
def desc_to_md(self, desc):
    md_head, file_pname = self.get_md_head_and_file_pname(desc)
    with open(file_pname, "w+", encoding='utf-8') as f:
        f.write(md_head.strip("\n") + tomd.Tomd(desc).markdown)
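# get_md_head_and_file_pname is defined elsewhere on the class; a purely
# hypothetical sketch of it -- the title extraction, front matter layout,
# and output path are all assumptions:
import os
import re

def get_md_head_and_file_pname(desc, out_dir="output"):
    text = re.sub(r'<[^>]+>', ' ', desc).strip()               # drop tags
    title = (text.splitlines() or ['untitled'])[0][:50].strip() or 'untitled'
    md_head = '---\ntitle: "%s"\n---\n\n' % title               # YAML front matter
    os.makedirs(out_dir, exist_ok=True)
    return md_head, os.path.join(out_dir, title + ".md")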
def parseToMarkdown(self, param):
    # str() accepts both plain strings and BeautifulSoup tags
    return tomd.Tomd(str(param)).markdown
def html2markdown(cls, html):
    mdTxt = tomd.Tomd(html).markdown
    return mdTxt
# However, complex pages raise an error.
print(__doc__)

import os
import sys
import tomd as td
import html2text as ht
from pprint import pprint

HOME_NAME = "web_beautifulsoup_scrapping"
DIRS = os.path.dirname(__file__).partition(HOME_NAME)
HOME_DIR = DIRS[0] + DIRS[1] + "\\"
sys.path.append(HOME_DIR)

# FILE_NAME = "01_fruit.html"  # N.G.
FILE_NAME = "html_to_text_sample.html"  # O.K.
FILE_W_DIR = HOME_DIR + "\\_statics\\" + FILE_NAME
with open(FILE_W_DIR, 'r', encoding='utf-8') as f:
    html = f.read()

a = td.Tomd(html).markdown
print(a)
# June 27 2017
# Andrew Xia
# main program (for testing stuff)

import sys
sys.path.insert(0, 'tomd/')
import tomd
import re

FOLDER = "/home/andrew/Documents/Evernote_170625/Journal/Ch 08"
FILE = "[over]analysis.html"

# Read the exported Evernote HTML
CONTENT = ""
with open(FOLDER + "/" + FILE) as f:
    for line in f:
        CONTENT += line

converter = tomd.Tomd(CONTENT, FOLDER, FILE)
converter.export('/home/andrew/Documents/Evernote_170625/Ch08_md')
# x = converter.markdown
# file = open('tmp.txt','w')
# file.write(x)
# file.close()
def run():
    html = getHtml()
    mdTxt = tomd.Tomd(html).markdown
    print('markdown: {}'.format(mdTxt))
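# getHtml is not shown; a minimal stand-in that reads saved HTML from disk
# (file name hypothetical) keeps run() self-contained:
def getHtml():
    with open('sample.html', encoding='utf-8') as f:
        return f.read()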
print(tittle)
infos = re.findall(r'<div class="col-md-3 info">(.*?)</div>',
                   str(base_req.content, 'utf-8', errors='ignore'), re.S)
img_url = re.findall(r'<a class="bigImage" href="(.*?)">',
                     str(base_req.content, 'utf-8', errors='ignore'))
sample_url = re.findall(r'<a class="sample-box" href="(.*?)">',
                        str(base_req.content, 'utf-8', errors='ignore'))
name = re.findall(r'<span style="color:#CC0000;">(.*?)</span>',
                  str(base_req.content, 'utf-8', errors='ignore'), re.S)
print(infos)
print(name)
info = tomd.Tomd(infos[0]).markdown
print(info)
print(img_url)
print(sample_url)
try:
    # Normalize the title: strip newlines, spaces, and punctuation
    t = tittle[0]
    tittle[0] = t.replace('\n', '')
    t = tittle[0].replace(' ', '')
    t = re.sub(r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+', '', t)
    t = t.replace('・', '')
    print(t)
except IndexError:
    pass