def test_header(self):
    """A literal '#' inside <p> must not become an <h1>; a real <h1> must survive."""
    converted = html2markdown.convert('<p># test</p>')
    soup = bs4.BeautifulSoup(markdown.markdown(converted), 'html.parser')
    self.assertEqual(len(soup.find_all('h1')), 0)
    converted = html2markdown.convert('<p><h1>test</h1></p>')
    soup = bs4.BeautifulSoup(markdown.markdown(converted), 'html.parser')
    self.assertEqual(len(soup.find_all('h1')), 1)
def test_links(self):
    """Markdown-style link text in <p> must not become an <a>; a real <a> must."""
    converted = html2markdown.convert('<p>[http://google.com](test)</p>')
    soup = bs4.BeautifulSoup(markdown.markdown(converted), 'html.parser')
    self.assertEqual(len(soup.find_all('a')), 0)
    converted = html2markdown.convert(
        '<p><a href="http://google.com">test</a></p>')
    soup = bs4.BeautifulSoup(markdown.markdown(converted), 'html.parser')
    self.assertEqual(len(soup.find_all('a')), 1)
def charger_article(f: Path, subfolder) -> "tuple[str, str, str, str, Dt]":
    """Load an HTML file as an article for the RSS feed.

    Parameters
    ----------
    f : Path
        Path of the article file to load.
    subfolder : str
        Sub-folder under the site's /texte/ URL where the article lives.

    Returns
    -------
    tuple[str, str, str, str, Dt]
        Arguments for ``nouvel_item``, in order: title, link, author,
        description, publication date.  (Fix: the previous annotation
        declared six elements while the function returns five.)
    """
    global_link = "https://l-electron-libre.github.io/texte/" + subfolder
    # File creation time is used as a best-effort publication date.
    pub_date = Dt.fromtimestamp(f.stat().st_ctime)
    with f.open(encoding='utf-8') as d:
        soup = BeautifulSoup(d.read(), features="html.parser")

    # Section "One" carries the main title.
    section_one = soup.find("section", {"id": "One"})
    section_one = section_one.findChildren(recursive=False)[0]
    title = section_one.find("h2")
    title = unidecode.unidecode(title.get_text())
    link = global_link + "/" + f.name

    # Section "two" carries the subtitle, author and summary paragraph.
    section_two = soup.find("section", {"id": "two"})
    section_two = section_two.findChildren(recursive=False)[0]
    subtitle = section_two.find("h2")
    subtitle = unidecode.unidecode(subtitle.get_text())
    title = title + " - " + html2markdown.convert(subtitle)

    author = section_two.find("h3")
    if author is None:
        author = ""
    else:
        author = html2markdown.convert(author.get_text())

    description = section_two.find("p")
    description = unidecode.unidecode(description.get_text())
    description = html2markdown.convert(description)

    # Order matters, see nouvel_item
    return title, link, author, description, pub_date
def pack_problem(url):
    """Fetch a problem by URL and scaffold its working directory.

    Creates ``<qid>_<title>/`` containing README.md (statement and stats
    table), NOTE.md (an empty notes template) and solution.py (the Python3
    code snippet filled into TEMPLATE).

    Raises FileExistsError if the directory already exists (deliberate
    guard against clobbering existing work).
    """
    data = fetch(url)
    qid = data["questionId"]
    title = data["title"]
    difficulty = data["difficulty"]
    stat = json.loads(data["stats"])
    content = data["content"]
    md = convert_html_tags(html2markdown.convert(content))
    d = Path(f'{qid}_{title.replace(" ", "_")}')
    d.mkdir()
    with open(d / "README.md", "w") as f:
        f.write(f"### [{qid}. {title}]({url})\n\n")
        f.write(f"{difficulty}\n\n")
        f.write(f"{md}\n\n")
        f.write(convert_stat_table(stat))
    with open(d / "NOTE.md", "w") as f:
        # Fix: these literals have no placeholders, so the f-prefix was noise.
        f.write("# Notes on Success\n")
        f.write("+ \n\n")
        f.write("> Time : O() , Space : O()")
    code = [s for s in data["codeSnippets"] if s["lang"] == "Python3"][0]["code"]
    with open(d / "solution.py", "w") as f:
        f.write(
            TEMPLATE.format(content=remove_html_tags(content).strip(),
                            code=code.strip()).strip())
def embed_formatter(self, search_terms, results, count):
    """Build Discord output for a query: a summary string followed by embeds.

    Returns a one-element list with a "no results" message when nothing
    matched; otherwise the summary line plus one Embed per shown result.
    """
    results = self.searcher._dedupe(results)
    total = len(results)
    count = min(count, total)
    if count == 0:
        return [f"Query {search_terms} yielded no results."]
    output = [
        f"Query {search_terms} yielded {total} results. Showing the top {count}:"
    ]
    for result in results[:count]:
        snippet = result.highlights("content", top=2)
        # Normalize the highlighter's classed <b> tags to plain <b>.
        snippet = re.sub(r'<b class=".+?">', '<b>', snippet)
        snippet = html2markdown.convert(snippet)
        embed = discord.Embed(color=0x883333)
        embed.title = result['title']
        embed.url = self.url(result)
        embed.description = f'...{snippet}...'
        output.append(embed)
    return output
def trait_cleanup_pass(struct): assert 'sections' not in struct, struct # Right now no traits have other sections trait = struct['trait'] if len(trait['sections']) == 0: del trait['sections'] else: assert False, struct soup = BeautifulSoup(trait['text'], "html.parser") first = list(soup.children)[0] if first.name == "i": text = get_text(first) if text.find("Note from Nethys:") > -1: first.clear() first.unwrap() trait['text'] = str(soup).strip() if trait['text'] != "": assert 'text' not in struct, struct struct['text'] = html2markdown.convert(trait['text']) if len(trait.get('sections', [])) > 0: assert 'sections' not in struct, struct struct['sections'] = trait['sections'] if trait.get('classes'): struct['classes'] = trait['classes'] if trait.get('links'): assert 'links' not in struct, struct struct['links'] = trait['links'] del struct['trait']
def test_inline_tag_escaping(self):
    """formatting characters should be escaped for inline-type tags"""
    for escChar in self.escapedChars:
        # Fix: the loop variable was never substituted into the test
        # strings, so every iteration tested only '*'.  Raw strings also
        # avoid the invalid '\*' escape sequence.
        testStr = '<span>**escape me**</span>'.replace('*', escChar)
        expectedStr = r'<span>\*\*escape me\*\*</span>'.replace('*', escChar)
        mdStr = html2markdown.convert(testStr)
        self.assertEqual(mdStr, expectedStr)
def _markdown_formatter(search_terms, results, count):
    """ Prepare an array of text output fromm a result set. """
    results = Searcher._dedupe(results)
    total = len(results)
    count = min(count, total)
    if count == 0:
        return [f"Your query {search_terms} yielded no results."]
    output = [
        f"Your query {search_terms} yielded {total} results. Showing the top {count}:"
    ]
    for result in results[:count]:
        snippet = result.highlights("content", top=2)
        # Normalize the highlighter's classed <b> tags to plain <b>.
        snippet = re.sub(r'<b class=".+?">', '<b>', snippet)
        snippet = html2markdown.convert(snippet)
        wrapped = textwrap.indent(
            textwrap.fill(f'...{snippet}...', width=120), prefix='    ')
        output.append(result['title'] + '\n' + wrapped)
    return output
def gen_description(soup):
    """Return plaintext/html/markdown renderings of a description node."""
    plain = escape_description(soup.text)
    raw_html = get_html(soup)
    md = html.unescape(html2markdown.convert(raw_html))
    return {"plaintext": plain, "html": raw_html, "markdown": md}
def test_block_tag_escaping(self):
    """formatting characters should NOT be escaped for block-type tags (except <p>)"""
    for escChar in self.escapableChars:
        # Input and expected output are identical: block tags pass through.
        original = '<div>**escape me**</div>'.replace('*', escChar)
        self.assertEqual(html2markdown.convert(original), original)
def test_p_escaping(self):
    """formatting characters should be escaped for p tags"""
    for escChar in self.escapedChars:
        testStr = '<p>**escape me**</p>'.replace('*', escChar)
        # Fix: use a raw string — '\*' in a plain literal is an invalid
        # escape sequence (SyntaxWarning on Python 3.12+).
        expectedStr = r'\*\*escape me\*\*'.replace('*', escChar)
        mdStr = html2markdown.convert(testStr)
        self.assertEqual(mdStr, expectedStr)
def test_p_escaping_2(self):
    """ensure all escapable characters are retained for <p>"""
    for escChar in self.escapableChars:
        testStr = '<p>**escape me**</p>'.replace('*', escChar)
        # Round-trip html -> markdown -> html must reproduce the input.
        roundTripped = markdown.markdown(html2markdown.convert(testStr))
        self.assertEqual(roundTripped, testStr)
def test_inline_tag_escaping_2(self):
    """ensure all escapable characters are retained for inline-type tags"""
    for escChar in self.escapableChars:
        # Fix: the loop variable was never substituted into the test
        # string, so every iteration tested only '*'.
        testStr = '<p><span>**escape me**</span></p>'.replace('*', escChar)
        mdStr = html2markdown.convert(testStr)
        reconstructedStr = markdown.markdown(mdStr)
        self.assertEqual(reconstructedStr, testStr)
def process_conversion(file_name):
    """Convert an HTML file to markdown using the module-level `converter`.

    Writes the result next to the input (``.html`` -> ``.md``) and returns
    the (input, output) file objects.  NOTE(review): the output handle is
    returned still open; callers are expected to close both.
    """
    cwd = os.getcwd()  # Get the current working directory (cwd)
    files = os.listdir(cwd)  # Get all the files in that directory
    print("Files in %r: %s" % (cwd, files))
    output_file_name = file_name.replace("html", "md")
    output_file = open(output_file_name, "w+")
    print("Output File Name : ", output_file_name)
    with open(file_name, "r") as input_file:
        if converter == "html2markdown":
            # Fix: convert() expects the HTML text, not the file object.
            md_str = html2markdown.convert(input_file.read())
            output_file.write(md_str)
        elif converter == "markdownify":
            # Fix: markdownify's md() also takes a string, not a file object.
            md_str = md(input_file.read())
            output_file.write(md_str)
        elif converter == "tomd":
            md_str = tomd.Tomd(input_file.read()).markdown
            output_file.write(md_str)
        else:
            print("Not a valid converter")
        return input_file, output_file
def save_article(post_res):
    """Save a crawled article, deduplicating by a SHA-256 of its URL.

    Arguments:
        post_res {dict} -- article data structure (url, title, author,
        content, pub_time, source)
    """
    sha256 = hashlib.sha256()
    sha256.update(post_res['url'].encode('utf8'))
    hash_key = sha256.hexdigest()
    try:
        Posts.get(Posts.hash_key == hash_key)
        # Article already exists -- skip.
        logging.error('文章已存在')
        return
    except Posts.DoesNotExist:
        # Fix: only the "not found" case should fall through to creation.
        # The previous bare `except Exception` also swallowed real DB
        # errors (and logged the normal not-found path as an error).
        pass
    post = Posts()
    post.url = post_res['url']
    post.title = post_res['title']
    post.author = post_res['author']
    content = post_res['content']
    content_md = html2markdown.convert(content)
    post.content = content
    post.content_md = content_md
    post.hash_key = hash_key
    post.create_time = datetime.datetime.now()
    post.pub_time = post_res['pub_time']
    post.source = post_res['source']
    post.save()
    if post.get_id():
        # Saved article <title>.
        logging.info('保存文章%s' % post.title)
def convertFahrplan(parsedHTML: BeautifulSoup) -> Calendar:
    """Build an iCalendar from the JSON talk blobs embedded in a Fahrplan page.

    Each <script type="application/json"> element describes one talk; each
    becomes one VEVENT in the returned calendar.
    """
    events = []
    congressTimeZone = pytz.timezone("Europe/Berlin")
    for el in parsedHTML.select("script[type='application/json']"):
        ej = json.loads(el.text)
        # The schedule uses the words "noon"/"midnight" for those times.
        startT = ej["schedule_start"].replace("noon", "12:00:00").replace(
            "midnight", "00:00:00")
        # Fix: naive_dt.replace(tzinfo=pytz_tz) attaches the zone's LMT
        # offset (+00:53 for Berlin); pytz requires localize() to get the
        # correct CET/CEST offset.
        startT = congressTimeZone.localize(dateutil.parser.parse(startT))
        duration = parseTimeDeltaStr(ej["schedule_duration"])
        descriptionText = html2markdown.convert(ej["description_html"])
        evt = Event()
        evt.add("uid", el["id"] + "@frab.cccv.de")
        # Kept for the disabled attendee feature below.
        speakers = ej["speakers"].split(", ")
        # for s in speakers:
        #     evt.add("attendee;CN=" + ''.join(e for e in s if e.isalnum()), "")
        evt.add(
            "summary",
            "[" + ej["language"] + "] " + ej["title"] + "; " + ej["speakers"])
        evt.add("description", ej["track_name"] + "\n" + descriptionText)
        evt.add("location", ej["room_name"])
        evt.add("DTSTART", startT)
        evt.add("DTEND", startT + duration)
        evt.add("name", "[" + ej["language"] + "]" + ej["title"])
        events.append(evt)
    cal = Calendar()
    for evt in events:
        cal.add_component(evt)
    cal["summary"] = "Remote Congress Experience"
    return cal
def ParseChunk(self, frag):
    """Render a fragment to HTML, clean it, round-trip through markdown,
    and strip the wrapping <p> tags."""
    cleaned = self.CleanGarbage(frag.toHtml())
    roundtripped = markdown.markdown(html2markdown.convert(cleaned))
    return self.StripP(roundtripped)
def test_table_tag(self):
    """<table> tags should be converted. columns should preserve width across rows. td|th tag attr style="text-align: [left|center|right]" should be observed """
    testStr = (
        '<table><thead><tr><th><b>One</b></th><th style="text-align: right">Two</th></tr></thead>'
        '<tbody><tr><td>Line 1</td><td>Second Line</td></tr></tbody></table>')
    expectedStr = '| One    | Two         |\n| ------ | -----------:|\n| Line 1 | Second Line |'
    self.assertEqual(html2markdown.convert(testStr), expectedStr)
def test_h2(self):
    """The second top-level element round-trips as an <h2> titled 'Test'."""
    roundtripped = markdown.markdown(html2markdown.convert(self.genericStr))
    soup = bs4.BeautifulSoup(roundtripped, 'html.parser')
    second = soup.find_all(recursive=False)[1]
    self.assertEqual(second.name, 'h2')
    self.assertEqual(second.string, 'Test')
def export_article(data):
    """Export one article record to export/<date>-<id>/index.md (+ images).

    Writes a Hugo-style page bundle: YAML frontmatter (title, summary,
    date, optional author, image resources), then the body converted from
    HTML to markdown.  Reads the module-level content_files /
    content_files_filename_to_id lookups for the image payloads.
    """
    # 'datetime' arrives as a unix timestamp; normalize to a datetime.
    ts = int(data['datetime'])
    dt = datetime.utcfromtimestamp(ts)
    data['datetime'] = dt
    del_keys = ['l18n_diffsource']
    for key in del_keys:
        if key in data:
            del data[key]
    # Copy for debug printing, without the (large) body text.
    printdata = dict(data)
    if 'bodytext' in printdata:
        del printdata['bodytext']
    #pprint(printdata)
    frontmatter = {
        'title': data['title'],
        'summary': data.get('short'),
        'date': data['datetime'].isoformat(),
    }
    if data.get('author'):
        frontmatter['author'] = data.get('author')
    folder_path = 'export/%s-%s' % (dt.strftime('%Y-%m-%d'), data['id'])
    os.makedirs(folder_path, exist_ok=True)
    if data.get('image'):
        # 'image' is a comma-separated list of filenames; each is decoded
        # from the base64 payload in content_files and saved beside index.md.
        resources = []
        filenames = data.get('image').split(',')
        for fn in filenames:
            fid = None
            if fn not in content_files_filename_to_id:
                print("Error: Image %s not in content_files_filename_to_id" % fn)
                continue
            fid = content_files_filename_to_id[fn]
            res = {'src': fn, 'title': ''}
            resources.append(res)
            # store file
            res_path = folder_path + "/" + fn
            with open(res_path, 'wb') as resfile:
                content = base64.b64decode(content_files[fid]['content'])
                resfile.write(content)
        frontmatter['resources'] = resources
    file_path = folder_path + '/index.md'
    with open(file_path, 'w') as myfile:
        myfile.write('---\n')
        myfile.write(yaml.dump(frontmatter, default_flow_style=False))
        myfile.write('---\n\n')
        myfile.write(html2markdown.convert(data['bodytext']) + '\n')
def htmltomarkdown(text):
    """ Safely convert html to markdown """
    try:
        return html2markdown.convert(text)
    except Exception as exc:
        # Best-effort: log and fall back to the raw input.
        logger.error(exc)
        return text
def edit(request):
    """Render the edit form pre-filled with the entry converted to markdown."""
    initial = {
        'title': request.POST.get('title'),
        'entry': html2markdown.convert(request.POST.get('entry')),
    }
    return render(request, "encyclopedia/edit.html",
                  {"form": NewEntryForm(initial=initial)})
def _sanitize_markdown(mdtext): "Removes unsafe text content from Markdown" dirty_html = markdown.markdown(mdtext) clean_html = bleach.clean(dirty_html, strip=True, tags=[ *bleach.sanitizer.ALLOWED_TAGS, "h1", "h2", "h3", "h4", "h5", "h6" ]) print(clean_html) return html2markdown.convert(clean_html)
def do_search(n_clicks, search_text):
    """Dash callback: run a full-text search and return (scatterplot, hit list).

    Returns an empty plot and list when there is no query or no hits.
    """

    def _empty_frame():
        # Shared shape for the no-query / no-hits cases
        # (fix: this construction was duplicated in two branches).
        frame = pd.DataFrame([])
        frame['Date'] = []
        frame['Marker'] = []
        return frame

    fragment_list = []
    if search_text is None:
        data_frame = _empty_frame()
    else:
        # TODO Where is the best place to call this?
        env.attachCurrentThread()
        hits = textSearcher.find_documents(search_text)
        if len(hits.scoreDocs) == 0:
            data_frame = _empty_frame()
        else:
            date_list = []
            for hit in hits.scoreDocs:
                document = textSearcher.get_document(hit.doc)
                doc_name = document.getField("doc_name")
                # Document names encode their date as MMDDYY.
                date_list.append(
                    datetime.datetime.strptime(doc_name.stringValue(),
                                               '%m%d%y'))
            data_frame = pd.DataFrame(date_list)
            data_frame['Marker'] = ['1'] * len(date_list)
            highlighted_hits = textSearcher.get_highlighted_hits()
            for highlighted_hit in highlighted_hits:
                # Fix: dropped an unused `counter` variable from this loop.
                for hit in highlighted_hit[1]:
                    fragment_list.append(
                        html.Li(
                            html.A(dcc.Markdown(html2markdown.convert(hit)),
                                   id={
                                       'type': 'hit_document',
                                       'index': highlighted_hit[0]
                                   })))
            data_frame.columns = ['Date', 'Marker']
    scatterplot = px.scatter(data_frame,
                             x="Date",
                             y="Marker",
                             range_x=['2015-01-01', '2017-12-31'])
    print(fragment_list)
    return scatterplot, html.Ul(fragment_list)
def htmltomarkdown(text):
    """ Safely convert html to markdown """
    try:
        return html2markdown.convert(text)
    except Exception as exc:
        # Best-effort: log a truncated sample and fall back to escaped input.
        logger.warning(f"error={exc};text={text[:100]}")
        return html.escape(text)
def test_inline_tag_break(self):
    """inline-type tags should not cause line breaks"""
    for tag in html2markdown._inlineTags:
        if tag in self.emptyElements:
            testStr = '<p>test <%s /> test</p>' % tag
        else:
            testStr = '<p>test <%s>test</%s> test</p>' % (tag, tag)
        roundtripped = markdown.markdown(html2markdown.convert(testStr))
        soup = bs4.BeautifulSoup(roundtripped, 'html.parser')
        # Still exactly one paragraph after the round trip.
        self.assertEqual(len(soup.find_all('p')), 1)
def __parser(self, doc):
    """Parse a course page, populating turma, prof, situacao, formula,
    notas, frequencias and ementa from the scraped HTML.

    NOTE(review): selectors assume the site's current layout (collapsed-box
    table, #avaliacoes, #frequencias, #planos) — verify against the page.
    """
    soup = BS(doc, "html.parser")
    # Header box: second table holds the class/teacher summary row.
    table = soup.select_one("div.box.box-primary.collapsed-box")
    table = table.select("table")[1]
    entry = parse_table(table)[0]
    self.__turma = entry["Turma"].text.strip()
    self.__prof = entry["Docentes"].text.strip()
    avaliacoes = soup.select_one("div#avaliacoes")
    # Status span carries a <strong> label we discard before reading text.
    situacao = avaliacoes.select_one("div.row.color-gray").select_one(
        "span")
    situacao.select_one("strong").extract()
    self.__situacao = situacao.text.strip()
    ma = avaliacoes.select_one("div#ma")
    formula = ma.select("span")
    # The grading formula is the second span, when present.
    self.__formula = formula[1].text.strip() if len(formula) >= 2 else None
    mf = Nota._parser(ma.select_one("div.row"))
    self.__notas = [mf] if mf else []
    # Individual grade rows: skip the header row and the trailing row.
    divs = (avaliacoes.select("div.box.box-primary")[1].select_one(
        "div.row").find_all("div", recursive=False))
    for div in divs:
        for row in div.select("div.row")[1:-1]:
            nota = Nota._parser(row)
            if nota:
                self.__notas.append(nota)
    frequencias = soup.select_one("div#frequencias")
    self.__frequencias = []
    # One box per term; its heading ends in ".../<year>".
    for div in frequencias.select("div.box.box-primary")[1:]:
        year = div.select_one("div").text.strip().split("/")[-1]
        table = div.select_one("table>tbody")
        for tr in table.select("tr"):
            # Cells: day/month, hour:minute, type, then attendance marks.
            day_month, hour_minute, tipo, *chamadas = (
                elem.text.strip() for elem in tr.select("td"))
            self.__frequencias.append(
                Frequencia._parser(year, day_month, hour_minute, tipo,
                                   chamadas))
    planos = soup.select_one("div#planos")
    for h3 in planos.select("h3"):
        del h3.attrs["class"]
    # Drop div wrappers and whitespace runs before markdown conversion.
    planos = "".join(
        re.compile(r"<div.*>|<\/div>|\n+|\t+").split(str(planos)))
    self.__ementa = html2markdown.convert(planos)
def get_readme_content(self, dataset): return """# {0} [![DOI](https://www.zenodo.org/badge/DOI/{1}.svg)](https://doi.org/{1}) Crawled from Zenodo ## Description {2}""".format( dataset["title"], dataset["doi_badge"], html2markdown.convert(dataset["description"]).replace( "\n", "<br />"))
def updatePages():
    """Convert each configured .docx page to markdown and write it out.

    For every entry in PAGES: read SOURCE with mammoth, post-process the
    HTML, convert to markdown, and write CSS followed by the markdown to
    DESTINATION.
    """
    for page in PAGES:
        # Fix: "WOrking" typo in the progress message.
        print("Working on %s " % page["SOURCE"])
        with open(page["SOURCE"], "rb") as docx_file:
            result = mammoth.convert_to_html(docx_file, style_map=STYLE_MAP)
            #-- need to do some HTML processing here
            # - find embed spans
            # Fix: renamed local `html` -> `page_html`; it shadowed the
            # stdlib `html` module name.
            page_html = updateHtml(result.value)
            md = html2markdown.convert(page_html)
            with open(page["DESTINATION"], "w", encoding="utf-8") as md_file:
                # CSS is written first, before the markdown body
                # (the old comment claimed "at the end").
                md_file.write(CSS)
                md_file.write(md)
def bot_event(event, context):
    """ Main function for bot """
    logger.info(event)
    logger.info(context)
    data = json.loads(event['body'])['data']
    roomId = data['roomId']
    personId = data['personId']
    messageId = data['id']
    logger.info('roomId: {}'.format(roomId))
    logger.info('personId: {}'.format(personId))
    # don't respond to yourself
    # else it will just be an infinate loop
    # Fix: `personId != MYID` replaces the awkward `not personId == MYID`.
    if personId != MYID:
        # Get text from message given message id
        res = get(
            url="https://api.ciscospark.com/v1/messages/{}".format(messageId),
            headers=HEADERS)
        text = res.json()['text']
        # Strip the bot's own display-name mention from the message.
        text = text.replace('Joey "The Machine" Ly', '')
        text = text.strip()
        # call bot and format response
        botResponse = call_bot(text)
        botResponse = botResponse.replace('target="_blank"', '')
        botResponse = html2markdown.convert(botResponse)
        logger.info('message: {}'.format(text))
        logger.info('bot response: {}'.format(botResponse))
        # post resposne to room as joeybot
        res = post(url="https://api.ciscospark.com/v1/messages",
                   headers=HEADERS,
                   data={
                       "markdown": botResponse,
                       "roomId": roomId
                   })
        logger.info('response from bot post: {}'.format(res))
    response = {
        "statusCode": 200,
    }
    return response
def _from_html(cls, html):
    """Alternate constructor: build an instance from raw HTML.

    Stores both the original HTML and its markdown rendering.
    """
    instance = cls()
    instance.html = html
    instance.text = html2markdown.convert(html)
    return instance