Example #1
def class_tr_to_str(tr: bs4.element.Tag) -> str:
    for a in tr.find_all("a"):
        a["href"] = BASE_API_URL + a["href"]
    data = (tr.find("td", class_="header"), tr.find("td",
                                                    class_="description"))
    nameSpan = data[0].find("span", class_="element-name")
    if data[0].find("span", class_="attribute-type") is not None:
        accessType = "param"
        type_ = data[0].find("span", class_="param-type").text.strip()
    else:
        accessType = "func"
    if accessType == "param":
        attributeMode = data[0].find("span", class_="attribute-mode").text
        header = f"`{nameSpan.text} :: {type_}` {attributeMode}"
    else:
        header = f"`{nameSpan.text}`"

    contents = [item for item in data[1].contents if item != " "]
    if len(contents) > 0:
        if len(contents) > 1 and "\n" not in contents[0]:
            description = tomd.convert(
                f"<p>{''.join([str(item) for item in contents[:-1]])}</p>"
            ).strip()
        else:
            description = contents[0].strip()
        return f"{header} - {description}"
    else:
        return header
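Note: the pattern above wraps a tag's inner contents in a <p> before calling tomd.convert, since tomd expects block-level HTML. A minimal, self-contained sketch of that trick (the sample HTML here is invented for illustration):

import tomd

# Hypothetical fragment: the inline children of a <td>, re-wrapped in a <p>
# so tomd sees a block-level element it knows how to convert.
inner_html = 'See <a href="https://example.com/page">the docs</a> for details.'
markdown = tomd.convert(f"<p>{inner_html}</p>").strip()
print(markdown)  # e.g. "See [the docs](https://example.com/page) for details."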
Example #2
def getNode(contentSoup):
    node = ArticleNode()
    node.depth = contentSoup['depth']
    node.id = contentSoup['id']
    node.name = contentSoup.find('a', class_='x-wiki-index-item').text.replace(
        '/', ' ')
    node.url = 'https://www.liaoxuefeng.com' + contentSoup.find(
        'a', class_='x-wiki-index-item')['href']

    print(node.toString())

    content = getUrlContent(node.url)
    if content is not None:
        soup = BeautifulSoup(content, 'lxml')
        node.articleHTML = str(
            soup.find('div', class_="x-wiki-content x-main-content"))
        node.articleMD = tomd.convert(node.articleHTML)
        with open('output/' + node.name + '.md', 'w', encoding='utf-8') as fs:
            fs.write(node.articleMD)
            fs.flush()

    for item in contentSoup.find_all('div', depth=str(int(node.depth) + 1)):
        node.children.append(getNode(item))

    return node
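Note: Example #2 recurses by filtering on the custom depth attribute of each index <div>. A minimal sketch of that attribute-based find_all, using made-up HTML that mirrors the structure the example walks:

from bs4 import BeautifulSoup

# Made-up index markup shaped like the page Example #2 crawls.
html = '''
<div depth="1" id="a"><a class="x-wiki-index-item" href="/a">A</a>
  <div depth="2" id="b"><a class="x-wiki-index-item" href="/b">B</a></div>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
root = soup.find('div', depth='1')
# Keyword filters match tag attributes, so this selects the next level down.
children = root.find_all('div', depth=str(int(root['depth']) + 1))
print([c['id'] for c in children])  # ['b']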
Example #3
    def _get_new_data(self, page_url, soup):
        '''
        Extract the useful data from a page.
        :param page_url: URL of the downloaded page
        :param soup:
        :return: the extracted data
        '''
        data = {}
        dbStore = DataOutput()
        htmldownloader = HtmlDownloader()

        data['article_url'] = page_url
        html = htmldownloader.download(page_url)
        ind_soup = BeautifulSoup(html, 'html.parser')
        title = ind_soup.find_all("h1", class_="csdn_top")
        data['title'] = title[0].string
        updated = ind_soup.find_all("span", class_="time")
        data['update_date'] = updated[0].string
        pageview = ind_soup.find_all("span", class_="txt")
        data['pageviewcnt'] = pageview[0].string
        articlecontent = ind_soup.find_all("div", class_="article_content")
        md = tomd.convert(articlecontent[0].prettify())
        # page_content can be too long for a MySQL BLOB column, so it is
        # saved to a local folder as a file instead.
        data['page_content'] = pickle.dumps(md.encode("utf-8"))
        dbStore.store_data(data)
        print(data)
        return data
Example #4
def get_url_content(url):
    str_html = requests.get(url)
    html = BeautifulSoup(str_html.text, 'html.parser')
    title = html.find('h1').text
    date = html.find('span', {'class': 'time'}).text

    html.find('div', {'class': 'toc'}).decompose()
    html.find('svg').decompose()
    html.find('br').decompose()

    clean_tags(html)
    replace_h_tags(html)
    get_image(title, html)

    content = ''.join(map(str, html.find('div', {'id': 'content_views'}).contents))
    md_content = tomd.convert(content)

    path = './content'
    file = title + '.md'
    file_path = path + '/' + file
    if not os.path.exists(path):
        os.makedirs(path)
    with open(file_path, 'w', encoding="utf-8") as f:
        f.write(str(md_content))
Example #5
    def _generate_ics(self):
        data = self.data
        if data is not None:
            c = Calendar()
            days = list(data.keys())
            days.sort()

            for xday in days:
                day_data = data[xday]
                title = day_data[0].text_content()
                week, day = title.split(", ")
                week = w2n.word_to_num(week.lstrip("Week "))
                day = w2n.word_to_num(day.lstrip("Day "))
                offset = (week - 1) * 7 + (day)
                event_day = self.start_date + timedelta(days=offset)
                event_day = event_day.replace(hour=0,
                                              minute=0,
                                              second=0,
                                              microsecond=0)
                description = "".join(
                    [str(html.tostring(el)) for el in day_data])
                description = tomd.convert(description)
                e = Event(name="Magoosh {}".format(title),
                          begin=event_day,
                          end=event_day,
                          description=description)
                e.make_all_day()
                c.events.add(e)

            with open(self.out_file, 'w') as f:
                f.writelines(c)
            print("File written to {}".format(self.out_file))
Example #6
    def parse_content(self):
        """ parse data from html to markdown """

        self.markdown = []

        try:
            for item in self.data:
                if item:
                    for i in item:
                        t = str(i)
                        test = re.search("^!", t)
                        if test:
                            # Image lines are already Markdown; don't convert them again.
                            self.markdown.append(t)
                            continue
                        md = tomd.convert(t)
                        self.markdown.append(md)

            filtered = list(
                filter(lambda x: not re.match(r'^\s*$', x), self.markdown))
            filtered = [x.replace('\n', '') for x in filtered]
            filtered = [x.replace('\t', '') for x in filtered]
            self.markdown = filtered

        except RuntimeError as err:
            print(Fore.RED, "[!] Recursion {0}".format(err))

        self.markdown.append('\n' + self.url)
Example #7
def anno_edit(request, pk):
    '''Edit an announcement.'''
    anno = Announcement.objects.get(pk=pk)
    # On a POST request, update the data
    if request.method == "POST":
        form = AnnoForm(request.POST, instance=anno)
        if form.is_valid():
            form.save()
            registerinfo = {
                'title': '修改成功',  # "Update successful"
                'subtitle': '数据更新成功',  # "Data updated successfully"
                'status': 'success', }
            context = {
                'registerinfo': registerinfo,
                'anno': Announcement.objects.all(),
            }
            return render(request, 'backend/annolist.html', context=context)
        else:
            registerinfo = {
                'title': '错误',  # "Error"
                'subtitle': '数据填写错误',  # "Invalid data submitted"
                'status': 'error', }
            context = {
                'form': form,
                'registerinfo': registerinfo,
                'anno': anno,
            }
            return render(request, 'backend/annoedit.html', context=context)
    # Otherwise (non-POST request), render the form
    else:
        anno.body = tomd.convert(anno.body)
        context = {
            'anno': anno,
        }
        return render(request, 'backend/annoedit.html', context=context)
Example #9
def get(self, request, pk):
    anno = Announcement.objects.get(pk=pk)
    anno.body = tomd.convert(anno.body)
    context = {"anno": anno}
    return render(request,
                  "my_admin/announcement_detail.html",
                  context=context)
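Note: Examples #7, #9, and #30 all follow the same Django pattern: the announcement body is stored as HTML, and tomd.convert turns it back into Markdown only when rendering the edit view. A stripped-down sketch of that conversion step, with an illustrative stand-in for the stored field:

import tomd

# Illustrative stand-in for Announcement.body as stored in the database.
stored_html = "<p>Server maintenance on <strong>Friday</strong>.</p>"

# What the edit form should show: the Markdown source, not raw HTML.
editable_markdown = tomd.convert(stored_html).strip()
print(editable_markdown)  # "Server maintenance on **Friday**."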
Example #10
def run():
    print('Fetching content...', end='')
    title, author, description, cta, img_url = get_meta_data()
    article = tomd.convert(get_article(cta))
    print('Done')

    date = datetime.now().strftime('%Y%m%d')
    commitMessage = f'{title} by {author}'
    fileName = os.path.join('blinks', f'{date[:4]}', f'{date}-{title}-{author}.md')

    print('Building output...', end='')
    # Convert to markdown, add source
    output = f'![{title}]({img_url})\n# {title}\n*{author}*\n\n>{description}\n\n{article}\n\nSource: [{commitMessage}](https://www.blinkist.com{cta})'
    print('Done')

    print(f'Committing file {fileName}...', end='')
    g = Github(repoToken)
    repo = g.get_repo(repoName)
    try:
        repo.create_file(fileName, commitMessage, output)
        print('Done')
        return 'OK'
    except GithubException:
        print('already exists')
        return 'File Exists'
Example #11
async def wiki_updated(name, feed_data, feed, feeds):
    time_latest_entry = feed_data['time_latest_entry']
    for i, entry in enumerate(feed.entries):
        if get_formatted_time(entry) > time_latest_entry:
            info_log('Found new wiki entry made on ' + entry.updated)
            summary = ''
            match = re.search(r'<p.*?>.*?</p>', entry.summary)
            if match:
                summary = html.unescape(tomd.convert(match.group()))
                summary = re.sub(r'\((\/\S*)\)',
                                 r'(https://wiki.factorio.com\1)', summary)
                summary = re.sub(r'</?bdi>', '', summary)
            embed = discord.Embed(
                title=f'{entry.author} changed {entry.title}',
                color=14103594,
                timestamp=datetime.datetime(*entry.updated_parsed[0:6]),
                url=entry.link,
                description=summary)
            channel = client.get_channel(feed_data['channel'])
            await channel.send(embed=embed)
        else:
            break
    feeds[name]['time_latest_entry'] = get_formatted_time(feed.entries[0])
    with open('feeds.json', 'w') as f:
        json.dump(feeds, f)
Example #12
def getMarkdown(entry, attr, selector, *args, **kwargs):
    if args or kwargs:
        if args and kwargs:
            snippet = soup.find_all(
                entry, attrs={attr: selector})[args[0]].find(
                    kwargs['opt_entry'],
                    attrs={kwargs['opt_attr']: kwargs['opt_selector']})
        elif kwargs and len(args) == 0:
            snippet = soup.find(entry, attrs={attr: selector}).find(
                kwargs['opt_entry'],
                attrs={kwargs['opt_attr']: kwargs['opt_selector']})
        uls = snippet.find('ul')
        if uls is not None:
            snippet.ul.append(soup.new_tag('p'))
        snippet = tomd.convert(str(snippet)).replace('- ', '\n-  ').strip()
        return snippet
    return tomd.convert(str(soup.find(entry, attrs={attr: selector})))
Example #13
def parse_article_content(bsObj, directory, title):
    # 1. Find html.
    html = bsObj.find('div', {'class': 'article_content'})
    md = tomd.convert(html.prettify())

    # 2. Write to the file.
    with open('%s/%s.md' % (directory, title), 'w', encoding='utf-8') as f:
        f.write(md)
Example #14
def __str__(self) -> str:
    html = (
        "<h1> {}</h1>\n<pre><code>Exercise ID {}</code></pre>\n<h2> Assignment </h2>{}\n"
        .format(self.data.title, self.id, self.data.assignment) +
        self.get_pre_exercise_code() + self.get_instructions() +
        self.get_sample_code() + self.get_anwsers() + self.get_hints() +
        self.get_solution())
    return tomd.convert(html)
Example #15
def get_class_description(soup: bs4.BeautifulSoup, class_: str) -> str:
    a = soup.select(f"tr > td.header > a[href=\"{class_}.html\"]")
    if len(a) == 1:
        descriptionTag = a[0].parent.parent.find("td", class_="description")
        descriptionRaw = "".join(
            [str(content) for content in descriptionTag.contents])
        return tomd.convert(f"<p>{descriptionRaw}</p>").strip()
    return None
Example #16
def download_statement(url):
    req = requests.get(url)
    filename = _extract_name(url)
    dirname = _extract_dirname(url)
    if req.status_code == 200:
        statement = _extract_statement(req.content)
        markdown = tomd.convert(str(statement))
        _save_file(markdown, filename, dirname)
        print(f'Successful download of {filename}!')
    else:
        print(f'We cannot get the request of {filename}!')
Example #17
def extract_text(status: Status) -> str:
    as_html = status["content"]
    text: str = tomd.convert(as_html)
    tags = status["tags"]
    mentions = status["mentions"]
    text = clean_tags(text, tags)
    text = clean_mentions(text, mentions)
    text = text.replace("<br />", "\n")
    text = clean_links(text)
    text = html.unescape(text)
    return text
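Note: Example #17 shows that tomd.convert can leave inline leftovers such as <br /> and HTML entities, which the author strips afterwards. A minimal demonstration of that post-processing step (the status HTML here is invented):

import html
import tomd

as_html = "<p>line one<br />line two &amp; more</p>"
text = tomd.convert(as_html)
text = text.replace("<br />", "\n")   # tomd passes <br /> through untouched
text = html.unescape(text)            # decode entities like &amp;
print(text.strip())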
Example #18
def get_event_description(div: bs4.element.Tag) -> str:
    for a in div.find_all("a"):
        a["href"] = BASE_API_URL + a["href"]
    data = (div.select("div.element-content > p"),
            div.find("div", class_="detail-content"))
    paragraphs = []
    for p in data[0]:
        contents = p.contents
        if not (len(contents) == 1 and len(contents[0].strip()) == 0):
            paragraphs.append(html.unescape(tomd.convert(str(p))))
    return "\n".join([p.strip().replace("\n", "") for p in paragraphs])
Example #19
def getArticle(id):
    url = "https://blog-console-api.csdn.net/v1/editor/getArticle?id={}".format(id)
    print(url)
    response = requests.get(url, headers=headers)
    responseJson = json.loads(response.content)
    if responseJson['data']['markdowncontent'] == '':
        text = tomd.convert(responseJson['data']['content'])
        # parser = Parser(responseJson['data']['content'])
        responseJson['data']["markdowncontent"] = text
        # print(''.join(parser.outputs))
    write_txt(json.dumps(responseJson['data'], ensure_ascii=False),
              pwd + "/articles/" + id + ".txt")
    time.sleep(1)
Example #20
def getList(entry, attr, selector, element):
    items = soup.find(entry, attrs={attr: selector}).find_all(element)
    list_md = ''
    counter = 0
    for idx, val in enumerate(items):
        if element == 'p':
            if val.get('class') == [u'counter']:
                counter += 1
                list_md += str(counter) + '. ' + val.text.strip() + '\n'
            else:
                list_md += '\n' + tomd.convert(str(val)).strip() + '\n'
        else:
            list_md += '- ' + val.text.strip() + '\n'
    return list_md
Example #21
def get_meta_data():
    container = get_element_from_request('https://www.blinkist.com/nc/daily', 'div', 'daily-book__container')

    title = container.find('h3', 'daily-book__headline').string.strip()
    author = container.find('div', 'daily-book__author').string.strip()
    #     description = container.find('div', 'dailyV2__free-book__description').string.strip()
    description_html = container.find('div', 'book-tabs__content-inner')
    description = tomd.convert(str(description_html).strip())

    #     cta = container.find('div', 'dailyV2__free-book__cta').a['href']
    cta = container.find('a', 'daily-book__cta').get('href')
    img_url = container.find('img')['src']

    return title, author, description, cta, img_url
Example #22
def convert(self):
    # Process the images.
    index = 1
    for e in self.contentQ(self.rule.img_tag):
        q = pq(e)
        img_src = self.rule.find_img(q)
        img_src_cur = self.rule.save_pic(self.url, self.title, index,
                                         img_src)
        if q[0].tag != "img":
            q.replace_with(pq('<img src="' + img_src_cur + '"/>'))
        else:
            q.attr(src=img_src_cur)
        index += 1
    # Convert to Markdown.
    self.rule.save_md(self.title, tomd.convert(self.contentQ))
Example #23
def extract_news(link):
    page = requests.get(link)
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    article_html = soup.select(".article")[0]
    article = str(article_html.encode('utf-8'))
    md = ''.join(
        tomd.convert(article).replace("\\n", "\n").replace(
            "\\xc2", "").replace("\\xa0", "").replace("\n\n\n", "\n").replace(
                "\n\n", "\n").replace("\\xe2\\x80\\x99", "'").split("|"))
    md = md.replace("\\xe2", "").replace("\\x86",
                                         "").replace("\\x92", "").replace(
                                             "\\x80", "").replace("\\93", "")
    md = extras.get_links(md)

    return md
Example #24
async def embed_fff(number):
    """
    Returns a discord.Embed object derived from an fff number
    """
    link = f"https://factorio.com/blog/post/fff-{number}"
    response = await get_soup(link)
    if response[0] == 200:
        soup = response[1]
        titleList = soup.find_all("h2")
        em = discord.Embed(title=titleList[0].string.strip(),
                           url=link,
                           colour=discord.Colour.dark_green())
        titleList = titleList[1:]
        if len(titleList) == 0:
            titleList = soup.find_all("h4")
        if len(titleList) == 0:
            titleList = soup.find_all("h3")
        for title in titleList:
            # Check for smaller font tag and append it to the title
            result = fontEx.search(str(title))
            if len([group for group in result.groups()
                    if group is not None]) == 1:
                name = result.group(1)
            else:
                name = result.group(1) + result.group(3)
            content = str(title.next_sibling.next_sibling)
            if "<p>" not in content:
                continue
            if "<ol>" in content:
                itemCount = 1
                while "<li>" in content:
                    content = content.replace("<li>", f"{itemCount}. ", 1)
                    itemCount += 1
            if "<ul>" in content:
                content = content.replace("<li>", "- ")
            for item in ["<ol>", "</ol>", "<ul>", "</ul>", "</li>", "<br/>"]:
                content = content.replace(item, "")
            # Escape Discord formatting characters
            for item in ["*", "_"]:
                content = content.replace(item, "\\" + item)
            content = content.replace("\n\n", "\n")
            em.add_field(name=name.replace("amp;", ""),
                         value=tomd.convert(content).strip())
    else:
        em = discord.Embed(title="Error",
                           description=f"Couldn't find FFF #{number}.",
                           colour=discord.Colour.red())
    return em
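Note: embed_fff hand-rolls list conversion because nested <ol>/<ul> markup in the blog posts needs numbering before tomd sees it. The core replacement loop, isolated as a hedged, standalone sketch:

def lists_to_markdown(content):
    # Number ordered-list items, dash unordered ones, then drop the list tags,
    # mirroring the approach in embed_fff above.
    if "<ol>" in content:
        item_count = 1
        while "<li>" in content:
            content = content.replace("<li>", f"{item_count}. ", 1)
            item_count += 1
    if "<ul>" in content:
        content = content.replace("<li>", "- ")
    for tag in ["<ol>", "</ol>", "<ul>", "</ul>", "</li>", "<br/>"]:
        content = content.replace(tag, "")
    return content

print(lists_to_markdown("<ol><li>first</li><li>second</li></ol>"))
# "1. first2. second" once the tags are stripped; tomd.convert handles the rest.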
Example #25
def write_md(url, md_count):
    html = urlopen(url)
    bsObj = BeautifulSoup(html, 'html.parser')

    title = bsObj.find('div', {'class': 'article-title-box'})
    title_convert = (title.h1).get_text()
    # print(title)
    # title_convert = tomd.convert(title.prettify())

    md = bsObj.find('div', {'class': 'article_content'})
    # print(md.prettify())
    convert = tomd.convert(md.prettify())
    md_name = str(md_count) + '.md'
    with open(md_name, 'w') as f:
        f.write(title_convert)
        f.write(convert)
Example #26
def getContent(url):
    page = []
    (year, month) = re.search(r'http://scienceblogs.com/evolgen/(\d+)/(\d+)', url).groups()
    print("processing %s %s" % (month, year), end='\r')
    html = requests.get(url=url)
    bs = BeautifulSoup(html.content, 'html.parser')
    header = bs.find('h1', {'class': 'title entry-title'})
    page.append("#" + header.text)
    abbr = bs.find('abbr', {'class': 'published'})
    page.append("Date: %s" % abbr.text)
    content = bs.find('div', {'class': 'content entry-content'})
    for tag in content:
        if tag == "\n":
            continue
        page.append(tomd.convert(str(tag)))
    return page
Example #27
def define_tr_to_str(tr: bs4.element.Tag) -> str:
    for a in tr.find_all("a"):
        a["href"] = BASE_API_URL + a["href"]
    data = (tr.find("td",
                    class_="header").string, tr.find("td",
                                                     class_="description"))
    contents = data[1].contents
    if len(contents) > 0:
        if len(contents) > 1 and "\n" not in contents[0]:
            description = tomd.convert(
                f"<p>{''.join([str(item) for item in contents[:-1]])}</p>"
            ).strip()
        else:
            description = contents[0].split('\n')[0].strip()
        return f"`{data[0]}` - {description}"
    else:
        return f"`{data[0]}`"
Example #28
def parse(package):
    package_name = get_package(package)
    url = 'https://github.com/%s/releases.atom' % package_name
    feed = feedparser.parse(url)
    entries = []
    for item in feed['entries']:
        owner, repo = package_name.split('/')
        version = re.search('(?<=Repository/)[0-9]+/(.+)', item['id']).group(1)
        author = item['authors'][0]['name'] if 'authors' in item and item['authors'] and item['authors'][0] and item['authors'][0]['name'] else None
        content = ""
        for obj in item['content']:
            if obj['type'] == 'text/html':
                content += obj['value']
        content = content.replace("<br />", "\n")
        content = tomd.convert(content[:1024])
        content = content.strip('\n')
        entries.append({
            "embeds": [{
                "title": "New release: %s" % version,
                "description": package_name,
                "url": item['link'],
                "thumbnail": {
                    "url": "https://github.com/%s.png" % owner,
                },
                "author": {
                    "name": author,
                    "url": "https://github.com/%s" % author,
                    "icon_url": "https://github.com/%s.png" % author,
                },
                "fields": [{
                    "name": item['title_detail']['value'],
                    "value": content[:1024],
                }],
                "footer": {
                    "text": time.strftime("%a %d %b, %Y at %I:%M %p", item['updated_parsed']),
                }
            }],
            "version": version,
            "package_name": package_name,
        })
    return entries
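Note: parse truncates the raw HTML to 1024 characters before converting, which can cut a tag in half and confuse tomd. A hedged alternative (my suggestion, not the original's behavior) is to convert first and truncate the resulting Markdown:

import tomd

def html_to_truncated_md(content, limit=1024):
    # Converting before truncating avoids feeding tomd a half-open tag.
    md = tomd.convert(content).strip('\n')
    return md[:limit]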
Example #29
def get_wiki_description(soup):
    """
    Returns the first paragraph of a wiki page BeautifulSoup
    """
    if soup.select(".mw-parser-output > p"):
        paragraphs = soup.select(
            ".mw-body-content > #mw-content-text > .mw-parser-output > p")
        # Skip the first paragraph if it is just a header line.
        pNum = 1 if headerEx.search(str(paragraphs[0])) else 0
        return tomd.convert(str(paragraphs[pNum])).strip().replace("<br/>", "\n")
    return ""
Example #30
def anno_edit(request, pk):
    """Edit an announcement."""
    anno = Announcement.objects.get(pk=pk)
    # On a POST request, update the data
    if request.method == "POST":
        form = AnnoForm(request.POST, instance=anno)
        if form.is_valid():
            form.save()
            messages.success(request, "数据更新成功", extra_tags="修改成功")  # "Data updated successfully" / "Update successful"
            return HttpResponseRedirect(reverse("sspanel:backend_anno"))
        else:
            messages.error(request, "数据填写错误", extra_tags="错误")  # "Invalid data submitted" / "Error"
            context = {"form": form, "anno": anno}
            return render(request, "backend/annoedit.html", context=context)
    # Otherwise (non-POST request), render the form
    else:
        anno.body = tomd.convert(anno.body)
        context = {"anno": anno}
        return render(request, "backend/annoedit.html", context=context)
Example #31
def convert(self, name="", selector=""):
    if not name:
        name = self.title
    if not selector:
        selector = self.xpath
    # Extract the article content.
    contentQ = self.rootQ(selector)
    # Process the images.
    index = 1
    for e in contentQ(self.rule.img_tag):
        q = pq(e)
        img_src = self.rule.find_img(q)
        img_src_cur = self.rule.save_pic(self.url, name, index, img_src)
        if q[0].tag != "img":
            q.replace_with(pq('<img src="' + img_src_cur + '"/>'))
        else:
            q.attr(src=img_src_cur)
        index += 1
    # Convert to Markdown.
    self.rule.save_md(name, tomd.convert(contentQ))
Example #32
import tomd
from tomd import Tomd

string = """
<p>
<i>italic</i>
<b><i>bold italic</i></b>
<em>em</em>
<strong>strong</strong>
aa <strong> strong   </strong> aa
</p>

<hr/>

<table>
<thead>
<tr class="1">
<th>th1</th>
<th>th2</th>
</tr>
</thead>
<tbody>
<tr>
<td>td</td>
<td>td</td>
</tr>
<tr>
<td>td</td>
<td>td</td>
</tr></tbody></table>

"""

print(Tomd(string).markdown)
print(tomd.convert(string))
Example #33
import tomd

# Read the source HTML and write the converted Markdown.
with open('in.txt', mode='r', encoding='UTF-8') as f:
    contents = f.read()

markdown = tomd.convert(contents)

with open('out.md', 'w+', encoding='UTF-8') as f_out:
    f_out.write(markdown)
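Note: taken together, the examples reduce to one core call, tomd.convert(html_string) -> markdown_string. A final self-contained sanity check, with a made-up fragment covering the tags the examples above rely on:

import tomd

fragment = (
    "<h1>Title</h1>"
    "<p>Some <strong>bold</strong> text and a "
    '<a href="https://example.com">link</a>.</p>'
)
print(tomd.convert(fragment))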