Example #1
    def test_header(self):
        result = html2markdown.convert('<p># test</p>')
        bs = bs4.BeautifulSoup(markdown.markdown(result), 'html.parser')
        self.assertEqual(len(bs.find_all('h1')), 0)

        result = html2markdown.convert('<p><h1>test</h1></p>')
        bs = bs4.BeautifulSoup(markdown.markdown(result), 'html.parser')
        self.assertEqual(len(bs.find_all('h1')), 1)
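
For readers unfamiliar with the pattern above, here is a minimal standalone sketch of the same round trip: convert HTML to Markdown with html2markdown, render it back with the markdown package, and inspect the result with BeautifulSoup. The sample input string is made up.

# Minimal round-trip sketch (the sample html_src string is made up)
import bs4
import markdown
import html2markdown

html_src = '<h1>Title</h1><p>Some <em>inline</em> text.</p>'
md_text = html2markdown.convert(html_src)           # HTML -> Markdown
rendered = markdown.markdown(md_text)               # Markdown -> HTML again
soup = bs4.BeautifulSoup(rendered, 'html.parser')
print(md_text)
print(len(soup.find_all('h1')))                     # expect the <h1> to survive the round trip
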
Example #2
    def test_links(self):
        result = html2markdown.convert('<p>[http://google.com](test)</p>')
        bs = bs4.BeautifulSoup(markdown.markdown(result), 'html.parser')
        self.assertEqual(len(bs.find_all('a')), 0)

        result = html2markdown.convert(
            '<p><a href="http://google.com">test</a></p>')
        bs = bs4.BeautifulSoup(markdown.markdown(result), 'html.parser')
        self.assertEqual(len(bs.find_all('a')), 1)
Example #3
def charger_article(f: Path, subfolder) -> (str, str, str, str, Dt):
    """
    Load a markdown file as an article.

    Parameters
    ----------
    f : Path
        Path of the article to load.
    subfolder : str
        Subfolder appended to the site URL.

    Returns
    -------
    (str, str, str, str, Dt)
        Arguments for the nouvel_item function.

    """
    global_link = "https://l-electron-libre.github.io/texte/" + subfolder

    pubDate = Dt.fromtimestamp(f.stat().st_ctime)

    with f.open(encoding='utf-8') as d:
        soup = BeautifulSoup(d.read(), features="html.parser")

        soup2 = soup.find("section", {"id": "One"})
        soup2 = soup2.findChildren(recursive=False)[0]
        title = soup2.find("h2")
        title = unidecode.unidecode(title.get_text())
        link = global_link + "/" + f.name

        soup = soup.find("section", {"id": "two"})
        soup = soup.findChildren(recursive=False)[0]
        title2 = soup.find("h2")
        title2 = unidecode.unidecode(title2.get_text())
        title = title + " - " + html2markdown.convert(title2)

        author = soup.find("h3")
        if author is None:
            author = ""
        else:
            author = html2markdown.convert(author.get_text())

        description = soup.find("p")
        description = unidecode.unidecode(description.get_text())
        description = html2markdown.convert(description)

    # The order matters, see nouvel_item
    return title, link, author, description, pubDate
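
A hedged sketch of how the returned tuple might be consumed: the nouvel_item call and the directory layout are assumptions; only the tuple order comes from the function above.

# Hedged sketch: iterate over article files and unpack the returned tuple.
# The "texte/blog" path and the nouvel_item call are assumptions.
from pathlib import Path

for article in Path("texte/blog").glob("*.html"):
    title, link, author, description, pubDate = charger_article(article, "blog")
    # nouvel_item(title, link, author, description, pubDate)  # signature assumed
    print(title, link, pubDate)
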
Example #4
def pack_problem(url):
    data = fetch(url)
    qid = data["questionId"]
    title = data["title"]
    difficulty = data["difficulty"]
    stat = json.loads(data["stats"])
    content = data["content"]
    md = html2markdown.convert(content)
    md = convert_html_tags(md)

    d = Path(f'{qid}_{title.replace(" ", "_")}')
    d.mkdir()
    with open(d / "README.md", "w") as f:
        f.write(f"### [{qid}. {title}]({url})\n\n")
        f.write(f"{difficulty}\n\n")
        f.write(f"{md}\n\n")
        f.write(convert_stat_table(stat))

    with open(d / "NOTE.md", "w") as f:
        f.write(f"# Notes on Success\n")
        f.write(f"+ \n\n")
        f.write(f"> Time : O() , Space : O()")

    code = [s for s in data["codeSnippets"]
            if s["lang"] == "Python3"][0]["code"]
    with open(d / "solution.py", "w") as f:
        f.write(
            TEMPLATE.format(content=remove_html_tags(content).strip(),
                            code=code.strip()).strip())
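
A usage sketch only: fetch(), TEMPLATE and the convert_*/remove_* helpers are defined elsewhere in that project, and the URL below is purely illustrative.

# Illustrative call (requires fetch, TEMPLATE, convert_html_tags, convert_stat_table
# and remove_html_tags from the surrounding module).
pack_problem("https://leetcode.com/problems/two-sum/")
# Expected side effect: a "<qid>_<title>" directory containing README.md, NOTE.md and solution.py.
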
Example #5
    def embed_formatter(self, search_terms, results, count):

        results = self.searcher._dedupe(results)
        total = len(results)

        if total < count:
            count = total

        if count == 0:
            return [f"Query {search_terms} yielded no results."]

        output = [
            f"Query {search_terms} yielded {total} results. Showing the top {count}:"
        ]
        for result in results[:count]:

            text = result.highlights("content", top=2)
            text = re.sub(r'<b class=".+?">', '<b>', text)
            text = html2markdown.convert(text)

            embed = discord.Embed(color=0x883333)
            embed.title = result['title']
            embed.url = self.url(result)
            embed.description = f'...{text}...'
            output.append(embed)
        return output
Example #6
def trait_cleanup_pass(struct):
    assert 'sections' not in struct, struct  # Right now no traits have other sections
    trait = struct['trait']
    if len(trait['sections']) == 0:
        del trait['sections']
    else:
        assert False, struct
    soup = BeautifulSoup(trait['text'], "html.parser")
    first = list(soup.children)[0]
    if first.name == "i":
        text = get_text(first)
        if text.find("Note from Nethys:") > -1:
            first.clear()
        first.unwrap()
    trait['text'] = str(soup).strip()
    if trait['text'] != "":
        assert 'text' not in struct, struct
        struct['text'] = html2markdown.convert(trait['text'])
    if len(trait.get('sections', [])) > 0:
        assert 'sections' not in struct, struct
        struct['sections'] = trait['sections']
    if trait.get('classes'):
        struct['classes'] = trait['classes']
    if trait.get('links'):
        assert 'links' not in struct, struct
        struct['links'] = trait['links']
    del struct['trait']
Example #7
 def test_inline_tag_escaping(self):
     """formatting characters should be escaped for inline-type tags"""
     for escChar in self.escapedChars:
         testStr = '<span>**escape me**</span>'.replace('*', escChar)
         expectedStr = '<span>\*\*escape me\*\*</span>'.replace('*', escChar)
         mdStr = html2markdown.convert(testStr)
         self.assertEqual(mdStr, expectedStr)
Example #8
    def _markdown_formatter(search_terms, results, count):
        """
        Prepare an array of text output from a result set.
        """
        results = Searcher._dedupe(results)
        total = len(results)

        if total < count:
            count = total

        if count == 0:
            return [f"Your query {search_terms} yielded no results."]

        output = [
            f"Your query {search_terms} yielded {total} results. Showing the top {count}:"
        ]

        for result in results[:count]:
            text = result.highlights("content", top=2)
            text = re.sub(r'<b class=".+?">', '<b>', text)
            text = html2markdown.convert(text)

            output.append(result['title'] + '\n' + textwrap.indent(
                textwrap.fill(f'...{text}...', width=120), prefix='    '))

        return output
Example #9
def gen_description(soup):
    description = {}
    description["plaintext"] = escape_description(soup.text)
    description["html"] = get_html(soup)
    description["markdown"] = html.unescape(
        html2markdown.convert(description["html"]))
    return description
Example #10
 def test_block_tag_escaping(self):
     """formatting characters should NOT be escaped for block-type tags (except <p>)"""
     for escChar in self.escapableChars:
         testStr = '<div>**escape me**</div>'.replace('*', escChar)
         expectedStr = '<div>**escape me**</div>'.replace('*', escChar)
         mdStr = html2markdown.convert(testStr)
         self.assertEqual(mdStr, expectedStr)
Example #11
 def test_p_escaping(self):
     """formatting characters should be escaped for p tags"""
     for escChar in self.escapedChars:
         testStr = '<p>**escape me**</p>'.replace('*', escChar)
         expectedStr = '\*\*escape me\*\*'.replace('*', escChar)
         mdStr = html2markdown.convert(testStr)
         self.assertEqual(mdStr, expectedStr)
Example #12
 def test_p_escaping_2(self):
     """ensure all escapable characters are retained for <p>"""
     for escChar in self.escapableChars:
         testStr = '<p>**escape me**</p>'.replace('*', escChar)
         mdStr = html2markdown.convert(testStr)
         reconstructedStr = markdown.markdown(mdStr)
         self.assertEqual(reconstructedStr, testStr)
Example #13
 def test_inline_tag_escaping_2(self):
     """ensure all escapable characters are retained for inline-type tags"""
     for escChar in self.escapableChars:
         testStr = '<p><span>**escape me**</span></p>'.replace('*', escChar)
         mdStr = html2markdown.convert(testStr)
         reconstructedStr = markdown.markdown(mdStr)
         self.assertEqual(reconstructedStr, testStr)
Example #14
def process_conversion(file_name):

    cwd = os.getcwd()  # Get the current working directory (cwd)
    files = os.listdir(cwd)  # Get all the files in that directory
    print("Files in %r: %s" % (cwd, files))

    output_file_name = file_name.replace("html", "md")
    output_file = open(output_file_name, "w+")
    print("Output File Name             : ", output_file_name)

    with open(file_name, "r") as input_file:

        if converter == "html2markdown":
            md_str = html2markdown.convert(input_file)
            output_file.write(md_str)
        elif converter == "markdownify":
            md_str = md(input_file)
            output_file.write(md_str)
        elif converter == "tomd":
            md_str = tomd.Tomd(input_file.read()).markdown
            output_file.write(md_str)
        else:
            print("Not a valid converter")

    return input_file, output_file
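
process_conversion reads a module-level `converter` name, so a driver might look like the sketch below; the converter choice and the glob pattern are assumptions.

# Hedged driver sketch: `converter`, `md` (markdownify) and `tomd` are module-level
# names in the original project; the glob pattern below is an assumption.
import glob

converter = "html2markdown"
for html_file in glob.glob("*.html"):
    process_conversion(html_file)
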
Example #15
 def save_article(post_res):
     """抓取文章并保存
 
     Arguments:
         post_res {[dict]} -- [文章数据结构]
     """
     sha256 = hashlib.sha256()
     sha256.update(post_res['url'].encode('utf8'))
     hash_key = sha256.hexdigest()
     try:
         post = Posts.get(Posts.hash_key == hash_key)
         logging.error('Article already exists')
         return
     except Exception as e:
         logging.error(e)
     post = Posts()
     post.url = post_res['url']
     post.title = post_res['title']
     post.author = post_res['author']
     content = post_res['content']
     content_md = html2markdown.convert(content)
     post.content = content
     post.content_md = content_md
     post.hash_key = hash_key
     post.create_time = datetime.datetime.now()
     post.pub_time = post_res['pub_time']
     post.source = post_res['source']
     post.save()
     if post.get_id():
         logging.info('Saved article %s' % post.title)
Example #16
def convertFahrplan(parsedHTML: BeautifulSoup) -> Calendar:
    events = []
    congressTimeZone = pytz.timezone("Europe/Berlin")

    for el in parsedHTML.select("script[type='application/json']"):
        ej = json.loads(el.text)
        startT = ej["schedule_start"].replace("noon", "12:00:00").replace(
            "midnight", "00:00:00")
        startT = dateutil.parser.parse(startT).replace(tzinfo=congressTimeZone)
        duration = parseTimeDeltaStr(ej["schedule_duration"])
        descriptionText = html2markdown.convert(ej["description_html"])
        evt = Event()
        evt.add("uid", el["id"] + "@frab.cccv.de")
        speakers = ej["speakers"].split(", ")
        # for s in speakers:
        # 	evt.add("attendee;CN=" + ''.join(e for e in s if e.isalnum()), "")
        evt.add(
            "summary",
            "[" + ej["language"] + "] " + ej["title"] + "; " + ej["speakers"])
        evt.add("description", ej["track_name"] + "\n" + descriptionText)
        evt.add("location", ej["room_name"])
        evt.add("DTSTART", startT)
        evt.add("DTEND", startT + duration)
        evt.add("name", "[" + ej["language"] + "]" + ej["title"])
        events.append(evt)

    cal = Calendar()

    for evt in events:
        cal.add_component(evt)
    cal["summary"] = "Remote Congress Experience"

    return cal
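
A hedged sketch of putting the returned Calendar to use: fetch a schedule page, convert it, and serialize it with icalendar's to_ical(); the URL and output filename are assumptions.

# Hedged sketch: fetch a schedule page, convert it and write an .ics file.
# The URL and filename are assumptions; Calendar.to_ical() is icalendar's serializer.
import requests
from bs4 import BeautifulSoup

resp = requests.get("https://fahrplan.events.ccc.de/congress/2020/Fahrplan/")  # assumed URL
cal = convertFahrplan(BeautifulSoup(resp.text, "html.parser"))
with open("congress.ics", "wb") as f:
    f.write(cal.to_ical())
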
Example #17
	def ParseChunk(self, frag):
		#print(dir(frag))
		ftg = frag.toHtml()
		gth = self.CleanGarbage(ftg)
		md = html2markdown.convert(gth)
		mth = markdown.markdown(md)
		stripped = self.StripP(mth)
		return stripped
Example #18
    def test_table_tag(self):
        """<table> tags should be converted. columns should preserve width across rows.
			td|th tag attr style="text-align: [left|center|right]" should be observed
		"""
        testStr = '<table><thead><tr><th><b>One</b></th><th style="text-align: right">Two</th></tr></thead>' \
            '<tbody><tr><td>Line 1</td><td>Second Line</td></tr></tbody></table>'
        expectedStr = u'| One    |         Two |\n| ------ | -----------:|\n| Line 1 | Second Line |'
        mdStr = html2markdown.convert(testStr)
        self.assertEqual(mdStr, expectedStr)
Example #19
    def test_h2(self):
        mdStr = html2markdown.convert(self.genericStr)
        reconstructedStr = markdown.markdown(mdStr)

        bs = bs4.BeautifulSoup(reconstructedStr, 'html.parser')
        childTags = bs.find_all(recursive=False)

        self.assertEqual(childTags[1].name, 'h2')
        self.assertEqual(childTags[1].string, 'Test')
Example #20
def export_article(data):
    ts = int(data['datetime'])
    dt = datetime.utcfromtimestamp(ts)
    data['datetime'] = dt

    del_keys = ['l18n_diffsource']
    for key in del_keys:
        if key in data:
            del data[key]

    printdata = dict(data)
    if 'bodytext' in printdata:
        del printdata['bodytext']
    #pprint(printdata)

    frontmatter = {
        'title': data['title'],
        'summary': data.get('short'),
        'date': data['datetime'].isoformat(),
    }

    if data.get('author'):
        frontmatter['author'] = data.get('author')

    folder_path = 'export/%s-%s' % (dt.strftime('%Y-%m-%d'), data['id'])
    os.makedirs(folder_path, exist_ok=True)

    if data.get('image'):
        resources = []
        filenames = data.get('image').split(',')
        for fn in filenames:
            fid = None
            if fn not in content_files_filename_to_id:
                print("Error: Image %s not in content_files_filename_to_id" %
                      fn)
                continue
            fid = content_files_filename_to_id[fn]
            res = {'src': fn, 'title': ''}
            resources.append(res)

            # store file
            res_path = folder_path + "/" + fn
            with open(res_path, 'wb') as resfile:
                content = base64.b64decode(content_files[fid]['content'])
                resfile.write(content)

        frontmatter['resources'] = resources

    file_path = folder_path + '/index.md'

    with open(file_path, 'w') as myfile:
        myfile.write('---\n')
        myfile.write(yaml.dump(frontmatter, default_flow_style=False))
        myfile.write('---\n\n')
        myfile.write(html2markdown.convert(data['bodytext']) + '\n')
Example #21
def htmltomarkdown(text):
    """
    Safely convert html to markdown
    """
    try:
        content = html2markdown.convert(text)
    except Exception as exc:
        logger.error(exc)
        content = text

    return content
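
A minimal usage sketch of the safe wrapper; the sample HTML is made up and `logger` must already be configured in the module.

# Minimal usage sketch (sample input is made up; `logger` must exist in the module).
snippet = '<p>Visit <a href="https://example.com">the docs</a> for <strong>details</strong>.</p>'
print(htmltomarkdown(snippet))   # Markdown on success, the original text if conversion fails
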
Example #22
def edit(request):
    title = request.POST.get('title')
    entry = request.POST.get('entry')
    return render(
        request, "encyclopedia/edit.html", {
            "form":
            NewEntryForm(initial={
                'title': title,
                'entry': html2markdown.convert(entry)
            })
        })
Example #23
def _sanitize_markdown(mdtext):
    "Removes unsafe text content from Markdown"
    dirty_html = markdown.markdown(mdtext)
    clean_html = bleach.clean(dirty_html,
                              strip=True,
                              tags=[
                                  *bleach.sanitizer.ALLOWED_TAGS, "h1", "h2",
                                  "h3", "h4", "h5", "h6"
                              ])
    print(clean_html)
    return html2markdown.convert(clean_html)
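
An illustrative call only; the exact Markdown produced depends on bleach's allowed-tag list and on html2markdown's formatting, so no particular output is asserted here.

# Illustrative input (made up); bleach.clean(strip=True) drops the <script> element,
# and the surviving HTML is converted back to Markdown.
unsafe = "# Heading\n\n**bold** text <script>alert('x')</script>"
print(_sanitize_markdown(unsafe))
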
Example #24
def do_search(n_clicks, search_text):
    fragment_list = []

    if search_text is None:
        data_frame = pd.DataFrame([])
        data_frame['Date'] = []
        data_frame['Marker'] = []
    else:
        # TODO Where is the best place to call this?
        env.attachCurrentThread()

        hits = textSearcher.find_documents(search_text)

        if len(hits.scoreDocs) == 0:
            data_frame = pd.DataFrame([])
            data_frame['Date'] = []
            data_frame['Marker'] = []
        else:
            date_list = []

            for hit in hits.scoreDocs:
                document_number = hit.doc
                document = textSearcher.get_document(document_number)
                doc_name = document.getField("doc_name")
                date = datetime.datetime.strptime(doc_name.stringValue(),
                                                  '%m%d%y')
                date_list.append(date)

            data_frame = pd.DataFrame(date_list)
            data_frame['Marker'] = ['1'] * len(date_list)

            highlighted_hits = textSearcher.get_highlighted_hits()

            for highlighted_hit in highlighted_hits:
                counter = 0
                for hit in highlighted_hit[1]:
                    fragment_list.append(
                        html.Li(
                            html.A(dcc.Markdown(html2markdown.convert(hit)),
                                   id={
                                       'type': 'hit_document',
                                       'index': highlighted_hit[0]
                                   })))
                    counter += 1

    data_frame.columns = ['Date', 'Marker']
    scatterplot = px.scatter(data_frame,
                             x="Date",
                             y="Marker",
                             range_x=['2015-01-01', '2017-12-31'])

    print(fragment_list)

    return scatterplot, html.Ul(fragment_list)
Example #25
def htmltomarkdown(text):
    """
    Safely convert html to markdown
    """

    try:
        content = html2markdown.convert(text)
    except Exception as exc:
        logger.warning(f"error={exc};text={text[:100]}")
        content = html.escape(text)

    return content
Example #26
    def test_inline_tag_break(self):
        """inline-type tags should not cause line breaks"""
        emptyElements = self.emptyElements
        for tag in html2markdown._inlineTags:
            if tag not in emptyElements:
                testStr = '<p>test <%s>test</%s> test</p>' % (tag, tag)
            else:
                testStr = '<p>test <%s /> test</p>' % tag
            mdStr = html2markdown.convert(testStr)
            bs = bs4.BeautifulSoup(markdown.markdown(mdStr), 'html.parser')

            self.assertEqual(len(bs.find_all('p')), 1)
Example #27
    def __parser(self, doc):
        soup = BS(doc, "html.parser")

        table = soup.select_one("div.box.box-primary.collapsed-box")
        table = table.select("table")[1]
        entry = parse_table(table)[0]
        self.__turma = entry["Turma"].text.strip()
        self.__prof = entry["Docentes"].text.strip()

        avaliacoes = soup.select_one("div#avaliacoes")

        situacao = avaliacoes.select_one("div.row.color-gray").select_one(
            "span")
        situacao.select_one("strong").extract()
        self.__situacao = situacao.text.strip()

        ma = avaliacoes.select_one("div#ma")

        formula = ma.select("span")
        self.__formula = formula[1].text.strip() if len(formula) >= 2 else None

        mf = Nota._parser(ma.select_one("div.row"))

        self.__notas = [mf] if mf else []
        divs = (avaliacoes.select("div.box.box-primary")[1].select_one(
            "div.row").find_all("div", recursive=False))
        for div in divs:
            for row in div.select("div.row")[1:-1]:
                nota = Nota._parser(row)
                if nota:
                    self.__notas.append(nota)

        frequencias = soup.select_one("div#frequencias")

        self.__frequencias = []

        for div in frequencias.select("div.box.box-primary")[1:]:
            year = div.select_one("div").text.strip().split("/")[-1]
            table = div.select_one("table>tbody")
            for tr in table.select("tr"):
                day_month, hour_minute, tipo, *chamadas = (
                    elem.text.strip() for elem in tr.select("td"))
                self.__frequencias.append(
                    Frequencia._parser(year, day_month, hour_minute, tipo,
                                       chamadas))

        planos = soup.select_one("div#planos")
        for h3 in planos.select("h3"):
            del h3.attrs["class"]
        planos = "".join(
            re.compile(r"<div.*>|<\/div>|\n+|\t+").split(str(planos)))
        self.__ementa = html2markdown.convert(planos)
Example #28
    def get_readme_content(self, dataset):
        return """# {0}

[![DOI](https://www.zenodo.org/badge/DOI/{1}.svg)](https://doi.org/{1})

Crawled from Zenodo

## Description

{2}""".format(
            dataset["title"], dataset["doi_badge"],
            html2markdown.convert(dataset["description"]).replace(
                "\n", "<br />"))
Example #29
def updatePages():
    for page in PAGES:
        print("WOrking on %s " % page["SOURCE"])
        with open(page["SOURCE"], "rb") as docx_file:
            result = mammoth.convert_to_html(docx_file, style_map=STYLE_MAP)
            #-- need to do some HTML processing here
            # - find embed spans
            html = updateHtml(result.value)

            md = html2markdown.convert(html)
            with open(page["DESTINATION"], "w", encoding="utf-8") as md_file:
                md_file.write(CSS)  # prepend some css before the converted markdown
                md_file.write(md)
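
updatePages relies on module-level PAGES, STYLE_MAP and CSS values that are not shown; the sketch below illustrates plausible shapes for them, and the paths, style map and CSS string are assumptions rather than the original project's values.

# Hedged configuration sketch (all values are assumptions).
STYLE_MAP = "p[style-name='Heading 1'] => h1:fresh"   # mammoth style-map syntax
CSS = "<style>img { max-width: 100%; }</style>\n\n"
PAGES = [
    {"SOURCE": "docs/about.docx", "DESTINATION": "site/about.md"},
]
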
Example #30
def bot_event(event, context):
    """
    Main function for the bot
    """
    logger.info(event)
    logger.info(context)

    data = json.loads(event['body'])['data']

    roomId = data['roomId']
    personId = data['personId']
    messageId = data['id']

    logger.info('roomId: {}'.format(roomId))
    logger.info('personId: {}'.format(personId))

    # don't respond to yourself
    #   else it will just be an infinite loop
    if not personId == MYID:
        # Get text from message given message id
        res = get(
            url="https://api.ciscospark.com/v1/messages/{}".format(messageId),
            headers=HEADERS)

        text = res.json()['text']
        text = text.replace('Joey "The Machine" Ly', '')
        text = text.strip()

        # call bot and format response
        botResponse = call_bot(text)
        botResponse = botResponse.replace('target="_blank"', '')
        botResponse = html2markdown.convert(botResponse)

        logger.info('message: {}'.format(text))
        logger.info('bot response: {}'.format(botResponse))

        # post response to room as joeybot
        res = post(url="https://api.ciscospark.com/v1/messages",
                   headers=HEADERS,
                   data={
                       "markdown": botResponse,
                       "roomId": roomId
                   })

        logger.info('response from bot post: {}'.format(res))

    response = {
        "statusCode": 200,
    }

    return response
Example #31
 def _from_html(cls, html):
     self = cls()
     self.html = html
     self.text = html2markdown.convert(html)
     return self
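
Since the enclosing class is not shown, here is a hedged sketch of how this alternate constructor might sit on a minimal class; the class name Message and its attributes are assumptions based only on the snippet above.

# Hedged sketch of an enclosing class for the alternate constructor (name and
# attributes are assumptions).
import html2markdown

class Message:
    def __init__(self):
        self.html = ""
        self.text = ""

    @classmethod
    def _from_html(cls, html):
        self = cls()
        self.html = html
        self.text = html2markdown.convert(html)
        return self

msg = Message._from_html("<p>Hello <b>world</b></p>")
print(msg.text)
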