Example #1
import readee

def getArticleHtml(name, link, index_loc):
    # readee.export fetches the page and returns a cleaned-up BeautifulSoup tree
    soup = readee.export(link)
    # Narrow down to the main article container, trying selectors in priority order
    funcs = [
        lambda x: x.find('div', {'property': 'articleBody'}),
        lambda x: x.find('article'),
        lambda x: x.find('div', {'id': 'story-body'}),
    ]
    for f in funcs:
        new_soup = f(soup)
        if new_soup:
            soup = new_soup
    # Demote <h2> headings to <h4>; fact() is a tag factory defined elsewhere
    for item in soup.find_all('h2'):
        new_item = fact().new_tag('h4')
        new_item.string = item.text
        item.replace_with(new_item)
    if len(soup.text) < 100:  # too little text extracted; treat as a failed export
        return
    # Link labels: 返回目录 = "back to index", 原文 = "original article"
    return '''
<html>
	<head>
		<title>%s</title>
	</head>
	<body>
		<h1>%s</h1>
		<div><a href="%s">返回目录</a></div>
		%s
		<div><br/><a href="%s">原文</a></div>
		<div><br/><a href="%s">返回目录</a></div>
	</body>
</html>
	''' % (name, name, index_loc, str(soup), link, index_loc)
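A minimal usage sketch for the function above; the title, URL, and file name are hypothetical, not from the source. getArticleHtml returns None when the extracted text is under 100 characters, so the caller should check the result:

# Hypothetical caller; all values below are illustrative
html = getArticleHtml('Sample article', 'https://example.com/post', 'index.html')
if html:  # None means the export was too short to keep
    with open('sample.html', 'w') as f:
        f.write(html)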
Example #2
import cached_url
import readee

def getArticleHtml(name, link, index_loc):
	content = None
	if 'bbc' in link:
		# Fetch BBC pages through cached_url so repeat runs reuse the cached copy
		content = cached_url.get(link, force_cache=True, sleep=5)
	args = {}
	if 'twreporter.org/' in link:
		# twreporter publishes in Traditional Chinese; convert to Simplified
		args['toSimplified'] = True
	soup = readee.export(link, content=content, **args)
	# Narrow down to the main article container, trying selectors in priority order
	funcs = [
		lambda x: x.find('div', {'property': 'articleBody'}),
		lambda x: x.find('article'),
		lambda x: x.find('div', {'id': 'story-body'}),
	]
	for f in funcs:
		new_soup = f(soup)
		if new_soup:
			soup = new_soup
	# Demote <h2> headings to <h4>; fact() is a tag factory defined elsewhere
	for item in soup.find_all('h2'):
		new_item = fact().new_tag('h4')
		new_item.string = item.text
		item.replace_with(new_item)
	if len(soup.text) < 100:  # too little text extracted; treat as a failed export
		return
	# Link labels: 返回目录 = "back to index", 原文 = "original article"
	return '''
<html>
	<head>
		<title>%s</title>
	</head>
	<body>
		<h1>%s</h1>
		<div><a href="%s">返回目录</a></div>
		%s
		<div><br/><a href="%s">原文</a></div>
		<div><br/><a href="%s">返回目录</a></div>
	</body>
</html>
	''' % (name, name, index_loc, str(soup), link, index_loc)
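The only differences from Example #1 are on the fetch side: BBC pages go through cached_url, which appears to cache fetched pages so repeat runs avoid re-downloading (the sleep argument presumably throttles real fetches), and twreporter.org articles get converted to Simplified Chinese. For readers unfamiliar with cached_url, here is a rough standard-library sketch of the same disk-memoization idea; the cache layout and hashing are assumptions, not cached_url's actual scheme:

import hashlib
import os
import time
import urllib.request

def get_cached(url, cache_dir='cache', sleep=0):
    # Hypothetical re-implementation: key each URL by its MD5 hash
    os.makedirs(cache_dir, exist_ok=True)
    path = os.path.join(cache_dir, hashlib.md5(url.encode()).hexdigest())
    if os.path.exists(path):
        with open(path, encoding='utf-8') as f:
            return f.read()
    if sleep:
        time.sleep(sleep)  # be polite between real fetches
    with urllib.request.urlopen(url) as resp:
        text = resp.read().decode('utf-8', errors='ignore')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
    return text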
Example #3
import os
import readee

def test():
    # urls and getFileName are defined elsewhere in the test module
    for url in urls:
        print('原文:', url)  # 原文 = "source"
        name = getFileName(url)
        with open(name, 'w') as f:
            f.write(str(readee.export(url, toSimplified=False)))
        print('导出:', name)  # 导出 = "exported"
        os.system('open ' + name + ' -g')  # macOS: open the file in the background
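getFileName and urls are not shown in this example. A plausible, purely hypothetical getFileName that turns a URL into a flat .html file name:

import re

def getFileName(url):
    # Hypothetical helper: slugify the URL into a writable file name
    slug = re.sub(r'\W+', '_', url.split('//')[-1]).strip('_')
    return slug + '.html'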
Example #4
import cached_url
import readee

def get(path):
	content = cached_url.get(path)
	b = readee.export(path, content=content)
	# Result, getImgs, getCap and getVideo are helpers defined elsewhere
	result = Result()
	result.imgs = getImgs(b)
	result.cap = getCap(b)
	result.video = getVideo(b)
	return result
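Result, getImgs, getCap and getVideo are not shown. A minimal hypothetical sketch of the container and one extractor, just to make the call shape concrete (the field defaults are assumptions):

class Result:
    # Hypothetical container mirroring the attributes assigned in get()
    def __init__(self):
        self.imgs = []
        self.cap = ''
        self.video = None

def getImgs(soup):
    # Hypothetical extractor: collect image sources from the cleaned tree
    return [img.get('src') for img in soup.find_all('img') if img.get('src')]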
Example #5
import cached_url
import readee

def check(link):
    try:
        content = cached_url.get(link, force_cache=True)
    except Exception:  # network or cache failure: treat the link as unusable
        return False
    soup = readee.export(link, content=content)
    # cnWordCount (defined elsewhere) counts Chinese words; keep mid-length articles
    if 200 < cnWordCount(soup.text) < 2500:
        return True
    return False
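cnWordCount is not shown either; for Chinese text the usual proxy is to count CJK characters, so a hypothetical version could look like this:

def cnWordCount(text):
    # Hypothetical: count CJK Unified Ideographs, one character per "word"
    return sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')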
Example #6
import readee

def crawl(name):
    # WIKI_PREFIX is a module-level constant defined elsewhere
    url = WIKI_PREFIX + name
    content = readee.export(url)
    # Drop thumbnail captions unless they sit inside a <figcaption>,
    # in which case keep only their text
    for item in content.find_all('div', class_='thumbcaption'):
        if item.parent.name != 'figcaption':
            item.decompose()
        else:
            item.replace_with(item.text)
    with open('result/' + name + '.html', 'w') as f:
        f.write(str(content))
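A hypothetical invocation; WIKI_PREFIX's real value is not shown, so the one below is an assumption, and crawl() expects a result/ directory to exist:

import os

WIKI_PREFIX = 'https://zh.wikipedia.org/wiki/'  # assumed value, not from the source
os.makedirs('result', exist_ok=True)  # crawl() writes into result/
crawl('Python')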
Example #7
import readee

def getArticleHtml(name, link_list, index_loc):
    result = ''
    for link in link_list:
        soup = readee.export(link)
        if len(soup.text) < 100:  # too little text extracted; skip this link
            continue
        result += str(soup)
    if not result:
        return
    # getHtml is a page-skeleton helper defined elsewhere; note that `link`
    # here is the last URL from the loop, so only the final source is linked
    return getHtml(
        name, '''
		%s
		<div><br/><a href="%s">原文</a></div>
	''' % (result, link), index_loc)
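getHtml itself is not shown; judging from Example #1's template, a hypothetical wrapper matching the call shape above might look like this (the skeleton is an assumption):

def getHtml(name, body, index_loc):
    # Hypothetical page skeleton modeled on Example #1
    return '''
<html>
	<head>
		<title>%s</title>
	</head>
	<body>
		<h1>%s</h1>
		<div><a href="%s">返回目录</a></div>
		%s
		<div><br/><a href="%s">返回目录</a></div>
	</body>
</html>
''' % (name, name, index_loc, body, index_loc)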
import readee
from bs4 import BeautifulSoup
from readability import Document

def _getArticle(url,
                toSimplified=False,
                force_cache=False,
                noAutoConvert=False):
    # getContent, _trimWebpage, _findUrl, _findTitle, _findAuthor,
    # calculateToSimplified, _Article and cc are helpers defined elsewhere
    content = getContent(url, force_cache=force_cache)
    soup = BeautifulSoup(_trimWebpage(content), 'html.parser')
    article_url = _findUrl(url, soup)
    doc = Document(content)
    title = _findTitle(soup, doc)
    to_simplify_calculated = calculateToSimplified(toSimplified, noAutoConvert,
                                                   title)
    article = _Article(
        title, _findAuthor(soup),
        readee.export(url,
                      content=content,
                      list_replace=True,
                      toSimplified=to_simplify_calculated), article_url)
    if to_simplify_calculated:
        # Convert Traditional Chinese title and author to Simplified
        article.title = cc.convert(article.title)
        article.author = cc.convert(article.author)
    return article
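cc is presumably an OpenCC converter created elsewhere in the module. A minimal setup sketch with the opencc package; the 't2s' (Traditional-to-Simplified) configuration name is an assumption:

import opencc

cc = opencc.OpenCC('t2s')  # assumed config: Traditional -> Simplified Chinese
print(cc.convert('漢字'))  # -> 汉字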