def test_html_to_tables(self):
    """Compare dashtable's HTML conversions with the stored fixtures.

    Every ``*.html`` file found in ``self.static_path`` is converted with
    ``dashtable.html2rst`` and ``dashtable.html2md``. Each result is then
    compared with the sibling ``.rst`` / ``.md`` file that shares the same
    base name (the expected text is right-stripped before comparing).

    Each mismatch is printed as ``MATCH ERROR: <html file name>`` so that
    all fixtures are checked in a single run. The test fails once at the
    end when at least one mismatch was seen, instead of silently passing.
    """
    mismatches = []

    for file_name in os.listdir(self.static_path):
        if not file_name.endswith('.html'):
            continue

        html_path = os.path.join(self.static_path, file_name)
        stem = os.path.splitext(file_name)[0]

        # Both conversions run before any expected file is read.
        conversions = (
            ('.rst', dashtable.html2rst(html_path)),
            ('.md', dashtable.html2md(html_path)),
        )

        for extension, actual in conversions:
            expected_path = os.path.join(self.static_path, stem + extension)
            # Context manager guarantees the handle is closed even when
            # reading or decoding fails.
            with open(expected_path, 'r', encoding='utf-8') as expected_file:
                expected = expected_file.read().rstrip()

            if actual != expected:
                print('MATCH ERROR: ' + ntpath.basename(html_path))
                mismatches.append(stem + extension)

    if mismatches:
        self.fail('Conversion mismatch for: ' + ', '.join(mismatches))
def build_md(doc=None):
    """Export a document to a Markdown table and open the result.

    The document is stored as HTML (filter ``HTML (StarCalc)``) in
    ``~/temp.html``, converted with ``html2md``, written to
    ``~/.ascii_table.txt`` and finally handed to the platform's default
    opener. The intermediate HTML file is always removed.

    Parameters
    ----------
    doc : optional
        Object exposing ``storeToURL``. When falsy, the document returned
        by ``XSCRIPTCONTEXT.getDocument()`` is used instead.
    """
    from com.sun.star.beans import PropertyValue

    document = doc if doc else XSCRIPTCONTEXT.getDocument()
    home = os.path.expanduser('~')

    html_path = os.path.join(home, 'temp.html').replace('\\', '/')
    # Paths that do not start with "/" (e.g. "C:/...") need a third slash
    # so the URL still has an empty host part.
    scheme = 'file://' if html_path.startswith('/') else 'file:///'
    save_url = scheme + html_path

    props = [PropertyValue(Name='FilterName', Value='HTML (StarCalc)')]
    document.storeToURL(save_url, props)
    try:
        md = html2md(html_path)
    finally:
        # The HTML export is scratch data only; do not leave it behind
        # when the conversion raises.
        os.remove(html_path)

    md_path = os.path.join(home, '.ascii_table.txt').replace('\\', '/')
    with open(md_path, 'w') as md_file:
        md_file.write(md)

    if sys.platform == "win32":
        # "start" is a cmd.exe builtin, hence shell=True; the empty string
        # is the window-title argument that "start" expects first.
        subprocess.call(['start', "", md_path], shell=True)
    elif sys.platform == "darwin":
        # macOS has no xdg-open; "open" is its equivalent.
        subprocess.call(['open', md_path])
    else:
        subprocess.call(['xdg-open', md_path])
# Regression script: converts every HTML fixture in ./test_files (relative to
# the current working directory) and compares the result with the stored
# .md / .rst file that shares the fixture's base name.
from dashtable import html2rst, html2md
import subprocess
import os

for file in os.listdir(os.getcwd() + '/test_files'):
    if file.endswith('.html'):
        path = os.path.join(os.getcwd(), 'test_files', file)

        # Load the complete HTML fixture as one string.
        f = open(path, 'r')
        lines = f.readlines()
        f.close()
        string = ''.join(lines)

        # The HTML text itself (not the path) is passed to the converters.
        converted_rst = html2rst(string)
        converted_md = html2md(string)

        # Expected Markdown: same base name, trailing whitespace removed.
        md_name = os.path.splitext(path)[0] + '.md'
        md_file = open(md_name, 'r')
        md_lines = md_file.readlines()
        md_file.close()
        md_string = ''.join(md_lines).rstrip()

        if not md_string == converted_md:
            print('MarkDown Error: ' + file)

        # Expected reStructuredText: same base name, trailing whitespace removed.
        rst_name = os.path.splitext(path)[0] + '.rst'
        rst_file = open(rst_name, 'r')
        rst_lines = rst_file.readlines()
        rst_file.close()
        rst_string = ''.join(rst_lines).rstrip()

        if not rst_string == converted_rst:
            # NOTE(review): the body of this check is cut off in the visible
            # source; presumably it reports the mismatch like the Markdown
            # branch above -- confirm against the full file.
def fetchMe(url):
    """Fetch one documentation page and save it as a Markdown file.

    ``url`` is treated as a site-relative path: it is appended to
    ``https://developer.prod.oculus.com`` to download the page and reused,
    relative to the current working directory, as the output location
    (``<cwd><url>.md``, without a trailing slash). Values containing
    ``"http"`` are skipped and nothing is written.

    Before conversion the parsed page is rewritten in place:

    * the first ``<h1>`` is removed (the title goes into the front matter);
    * images get a local ``/images/<slug>-<n><ext>`` source and are replaced
      by their Markdown form (the Facebook tracking pixel is left alone);
    * ``<ul>``/``<ol>``/``<table>`` are replaced by paragraphs holding their
      Markdown rendering, ``<br>`` by newlines, ``<pre>`` content is wrapped
      in ``<code>`` and ``<samp>`` is renamed to ``<code>``;
    * the ``https://developer.oculus.com`` prefix is stripped from links.

    The element with class ``documentation-content`` is then converted with
    ``tomd`` and written, preceded by a ``title:`` front-matter block, as
    UTF-8. The encoded output is also printed.

    NOTE: this function relies on Python 2 APIs (``unicode``,
    ``str.decode``).
    """
    if "http" in url:
        return

    print(url)
    urlToUse = 'https://developer.prod.oculus.com' + url
    r = requests.get(urlToUse)
    soup = BeautifulSoup(r.text, 'html5lib')

    # Quote the title when it contains a colon so the "title: ..." line of
    # the front matter stays unambiguous.
    title = soup.title.string
    if ":" in title:
        title = '"' + title + '"'

    firsth1 = soup.select_one("h1")
    if firsth1:
        firsth1.decompose()

    imageNumber = 0
    for img in soup.findAll("img"):
        src = img.get('src')
        # Skip images without a source as well as the tracking pixel.
        if not src or 'https://www.facebook.com/tr?i' in src:
            continue
        print(src)
        extension = get_ext(src)
        newFileStub = ('/images/' + slugify(url.decode('utf-8')) + '-' +
                       str(imageNumber) + extension)
        newFilename = os.getcwd().replace('\\', '/') + newFileStub
        # Downloading the image to newFilename is currently disabled; only
        # the target path is reported.
        print(newFilename)
        imageNumber = imageNumber + 1
        img['src'] = newFileStub
        imgMD = md(str(img))
        img.name = "p"
        img.string = imgMD

    # All <ul> elements are handled before any <ol>, as two separate passes.
    for listTag in ("ul", "ol"):
        for node in soup.findAll(listTag):
            node.string = md(str(node))
            node.name = 'p'

    for br in soup.findAll("br"):
        br.replaceWith('\n')

    for pre in soup.findAll("pre"):
        code = soup.new_tag('code')
        # .string is None when the tag has more than one child; fall back
        # to the flattened text so such blocks are not lost.
        tmp = pre.string
        if tmp is None:
            tmp = pre.get_text()
        pre.string = ''
        code.string = tmp
        pre.append(code)

    for samp in soup.findAll("samp"):
        samp.name = "code"

    for table in soup.findAll("table"):
        print(table)
        tableMD = dashtable.html2md(
            unicode(table).encode('ascii', 'ignore'))
        print(tableMD)
        table.string = tableMD
        table.name = "p"

    for link in soup.findAll("a"):
        href = link.get('href')
        # Anchors without an href (named anchors) have nothing to rewrite.
        if href:
            link['href'] = href.replace('https://developer.oculus.com', '')

    bodyHTML = str(soup.find(class_='documentation-content'))
    bodyMD = str(tomd.convert(bodyHTML))

    output = '---\n'
    output += 'title: ' + title + '\n'
    output += '---\n'
    output += bodyMD.decode('utf-8')

    linkToUse = url[:-1] if url[-1:] == '/' else url
    outputFileName = os.getcwd().replace('\\', '/') + linkToUse + '.md'

    # Mode 'w' truncates an existing file, so only the directory has to be
    # created when it is missing.
    dirname = os.path.dirname(outputFileName)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    encoded = output.encode('utf-8')
    with open(outputFileName, 'w') as f:
        f.write(encoded)
    print(encoded)
# Manual smoke run: prints the reStructuredText and Markdown conversion of
# every HTML fixture in ./test_files (relative to the current working
# directory) and additionally runs dashtable/html2rst.py on each fixture,
# writing to a ".txt" file next to it.
from dashtable import html2rst, html2md
import subprocess
import os
import sys

test_dir = os.path.join(os.getcwd(), 'test_files')
# The converter script does not depend on the fixture, so resolve it once.
script = os.path.join(os.getcwd(), 'dashtable/html2rst.py')

for file_name in os.listdir(test_dir):
    if not file_name.endswith('.html'):
        continue

    path = os.path.join(test_dir, file_name)
    with open(path, 'r') as f:
        string = f.read()

    print(file_name)
    print(html2rst(string))
    print('\n')
    print(html2md(string))
    print('\n')

    stem = os.path.splitext(file_name)[0]
    outfile = os.path.join(test_dir, stem + '.txt')
    # Use the interpreter running this script: a bare "python" may be
    # missing from PATH or may be an installation without dashtable.
    subprocess.call([sys.executable, script, path, outfile])
# Minimal usage example: convert a two-column HTML table (one header row,
# one data row) to Markdown and print the result.
import dashtable

table_html = """ <table> <tr><th>Header 1</th><th>Header 2</th></tr> <tr><td>Data 1</td><td>Data 2</td></tr> </table> """
table_markdown = dashtable.html2md(table_html)
print(table_markdown)