import re

import feedparser
import pyquery

from feed import start, get, strip


def get_content(url):
    r = get(url)
    q = pyquery.PyQuery(r.text.encode('EUC-JP', 'ignore'))
    # drop comment/processing-instruction nodes (their .tag is callable)
    [q(i).remove() for i in q.root.iter() if callable(i.tag)]
    q('script, .posted, .amazlet-box, .poweredAdsBy, .menu').remove()
    q('.blogbody div, span, br').each(lambda i, e: e.attrib.clear())
    [strip(i) for i in q.root.iter()]
    content = q('.blogbody').html()
    content = re.sub(r'(<br/>)+', '<br/>', content)
    return content


def get_feed():
    f = feedparser.parse('http://blog.livedoor.jp/dqnplus/atom.xml')
    return {
        'link': f.feed.link,
        'title': f.feed.title,
        'entries': [{
            'link': e.link,
            'title': e.title,
            'author': e.author,
            'content': get_content(e.link),
            'updated': e.updated,
        } for e in f.entries]
    }


if __name__ == '__main__':
    start('dqnplus', get_feed, 1800, 3600)
import textwrap
import time

import pyquery
import pystache

from feed import start, get


def get_entries():
    # The top of this function was lost in this excerpt; fetching the gallery
    # index (the feed link below) is assumed here.
    r = get('http://g.e-hentai.org/')
    q = pyquery.PyQuery(r.text)
    for i in q('.gtr0, .gtr1'):
        link = q('.it5 a', i).attr('href')
        src = q(i).find('.it2 img').attr('src')
        title = q(i).find('.it2 img').attr('alt')
        if not src:
            # thumbnail cells without an <img> pack host/path/title into the cell text
            t = q(i).find('.it2').text().split('~')
            src = 'http://' + t[1] + '/' + t[2]
            title = t[3]
        img = q('<img>').attr('src', src)
        a = q('<a>').attr('href', link).append(q('<div>').text(title)).append(img.wrap('<div>'))
        content = q('<div>').append(a).html()
        content = pystache.render(textwrap.dedent('''\
            <a href="{{link}}">
            <div>{{author}} {{title}}</div>
            <div><img src="{{src}}"></div>
            </a>'''), locals())
        published = q('.itd:first', i).text()
        # NOTE: 'author' is never assigned in the surviving snippet; its
        # extraction was dropped when the file was truncated.
        yield {"link": link, "title": title, "author": author,
               "content": content, "published": published}
        time.sleep(3)


def get_feed():
    return {
        "link": 'http://g.e-hentai.org/',
        "title": 'e-hentai',
        "entries": get_entries()
    }


if __name__ == '__main__':
    start('e-hentai', get_feed, 7200, 86400)
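# Side note on the pystache.render(..., locals()) pattern used above: pystache
# accepts any mapping as its rendering context, so passing locals() fills
# {{link}}, {{author}}, {{title}} and {{src}} from the surrounding variables.
# A tiny standalone check (the values here are made up for illustration):
import pystache

link, author, title, src = 'http://example.com/g/1', 'uploader', 'Sample title', 'http://example.com/t.jpg'
html = pystache.render('<a href="{{link}}">{{author}} {{title}}</a> <img src="{{src}}">', locals())
# -> '<a href="http://example.com/g/1">uploader Sample title</a> <img src="http://example.com/t.jpg">'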
# URL, OLDEST_SEC and the helpers (now, diff, debug, info, sleep, get_page,
# get_entry, start) are defined elsewhere in the original module.


def get_entries():
    """Walk board index pages until entries get older than OLDEST_SEC."""
    url = URL
    start = now()
    last = now()
    pages = 0
    while url and diff(start, last) < OLDEST_SEC:
        debug(u"diff {diff}", diff=diff(start, last))
        page = get_page(url)
        pages += 1
        posts = len(page['posts'])
        info(u"page {pages} {url} {posts} posts")
        for post in page['posts']:
            entry = get_entry(post['link'], post['prefix'], post['title'])
            yield entry
            last = min(last, entry['updated'])
            sleep()
        if not page['posts']:
            sleep()
        url = page['next_url']
        sleep()


def get_feed():
    return {
        'link': URL,
        'title': 'Gossiping',
        'entries': sorted(get_entries(), key=lambda e: e['published'], reverse=True)
    }


if __name__ == '__main__':
    start('Gossiping', get_feed, 1800, 3600)
import re

import feedparser
import pyquery

from feed import start, get, strip


def get_content(url):
    # The original top of this function was truncated; the fetch/parse step is
    # assumed to match the other scrapers in this repo.
    r = get(url)
    q = pyquery.PyQuery(r.text)
    q('.ad_amazon, .jin-ads, #other_news_website').remove()
    q('#popular_articles_comment, #hot_tweet, #category-link').remove()
    q('.related-articles, #ad2, .ent_ad_md, #ad_rs, #tags').remove()
    q('.tooltip, .comment_form, .article_header').remove()
    q(q('.article_bodymore > table')[-2:]).remove()
    q(q('#comment_list li')[30:]).remove()
    q('#comment_list li dl').replace_with(lambda i, x: x[1].text)
    q('#comment_list ul')[0].tag = 'ol'
    q('img').wrap('<div style="float: left !important">')
    [strip(i) for i in q.root.iter()]
    content = q('.article').html() + q('#comment').html()
    content = re.sub(r'(<br/>)+', '<br/>', content)
    return content


def get_feed():
    f = feedparser.parse('http://jin115.com/index.rdf')
    return {
        'link': f.feed.link,
        'title': f.feed.title,
        'entries': [{
            'link': e.link,
            'title': e.title,
            'author': e.author,
            'content': get_content(e.link),
            'updated': e.updated,
        } for e in f.entries]
    }


if __name__ == '__main__':
    start('jin115', get_feed, 1800, 3600)
def get_feed(): return {"link": "http://avno1.playno1.com/", "title": "AV No.1", "entries": get_all_entries()} def get_all_entries(): for e in get_entries("http://www.playno1.com/portal.php?mod=list&catid=3"): yield e for e in get_entries("http://www.playno1.com/portal.php?mod=list&catid=4"): yield e def get_entries(url): q = pq(url, redirect=False) for e in q(".fire_float"): href = q("a", e).attr("href") yield get_entry(href) time.sleep(3) def get_entry(href): link = "http://www.playno1.com/" + href q = pq(link, redirect=False) title = q("h1:first").text() published, author = q("h1:first ~ p").text().split(u" | \u4f5c\u8005:") content = q("#article_content").html() return {"link": link, "title": title, "author": author, "content": content, "published": published} if __name__ == "__main__": start("avno1", get_feed, 3600, 7200)
import asyncio

from aiohttp import web

import feed

# STREAMING_FEED is the app key the background task is stored under; it is
# defined elsewhere in the original module.


async def start_listening(app: web.Application, feed: feed.StreamingFeed):
    # Start the feed's long-running listener and keep a handle on the task.
    # (The `feed` parameter shadows the module, but the annotation is evaluated
    # at definition time, so it still resolves to feed.StreamingFeed.)
    app[STREAMING_FEED] = asyncio.create_task(feed.start())
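# A sketch of how start_listening() could be wired into an aiohttp application,
# with a matching cleanup hook that cancels the background task on shutdown.
# It builds on the imports above; the feed.StreamingFeed() constructor call and
# the make_app/stop_listening names are assumptions for illustration, not
# confirmed by the snippet above.
import contextlib


async def stop_listening(app: web.Application):
    task = app[STREAMING_FEED]
    task.cancel()
    with contextlib.suppress(asyncio.CancelledError):
        await task


def make_app() -> web.Application:
    app = web.Application()
    streaming_feed = feed.StreamingFeed()  # hypothetical: however the feed is built

    async def on_startup(app: web.Application):
        await start_listening(app, streaming_feed)

    app.on_startup.append(on_startup)
    app.on_cleanup.append(stop_listening)
    return app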
import re

import feedparser
import pyquery

from feed import start

# get_title() and get_comments() are defined elsewhere in the original file;
# only the functions below survived in this excerpt.


def get_author(e):
    return (re.findall(r'>(\w+)<', e.summary) or ['hackernews'])[0]


def get_content(e):
    url = re.findall(r'https://news.ycombinator.com/item\?id=\d+', e.summary)[0]
    q = pyquery.PyQuery('<div>')
    q.append(q('<a>').attr('href', url).text(url))
    for c in get_comments(url):
        q.append(q('<p>').html(c))
    return q.html()


def get_feed():
    f = feedparser.parse('http://hnbest.herokuapp.com/rss')
    return {
        'link': f.feed.link,
        'title': f.feed.title,
        'updated': f.feed.updated,
        'entries': [{
            'link': e.link,
            'title': get_title(e),
            'author': get_author(e),
            'content': get_content(e),
            'published': e.published,
            'updated': e.updated,
        } for e in f.entries]
    }


if __name__ == '__main__':
    start('hackernews', get_feed, 3600, 7200)
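# get_comments() above is not included in this excerpt. A minimal sketch of what
# such a helper could look like, assuming requests + pyquery and Hacker News'
# span.commtext markup for comment bodies (the function body and the `limit`
# parameter are hypothetical, not the project's actual code):
import requests
import pyquery


def get_comments(url, limit=30):
    # fetch the HN item page and return the HTML of the first `limit` comments
    q = pyquery.PyQuery(requests.get(url).text)
    return [q(c).html() for c in q('.commtext')[:limit]]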