Example #1
import re

import feedparser
import pyquery

from feed import start, get, strip

def get_content(url):
    r = get(url)
    # Hand lxml EUC-JP bytes (this blog's encoding), dropping characters
    # that cannot be encoded.
    q = pyquery.PyQuery(r.text.encode('EUC-JP', 'ignore'))
    # Drop comment nodes: lxml comment elements have a callable .tag.
    for node in q.root.iter():
        if callable(node.tag):
            q(node).remove()
    q('script, .posted, .amazlet-box, .poweredAdsBy, .menu').remove()
    q('.blogbody div, span, br').each(lambda i, e: e.attrib.clear())
    # strip() comes from the feed helper module.
    for node in q.root.iter():
        strip(node)
    content = q('.blogbody').html()
    # Collapse runs of consecutive <br/> into one.
    content = re.sub(r'(<br/>)+', '<br/>', content)
    return content

def get_feed():
    f = feedparser.parse('http://blog.livedoor.jp/dqnplus/atom.xml')
    return {
        'link': f.feed.link,
        'title': f.feed.title,
        'entries': [{
            'link': e.link,
            'title': e.title,
            'author': e.author,
            'content': get_content(e.link),
            'updated': e.updated
        } for e in f.entries]
    }

if __name__ == '__main__':
    start('dqnplus', get_feed, 1800, 3600)
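
Each of these scripts hands a feed-builder to start() from the author's private feed module, together with two numbers (here 1800 and 3600 seconds) that look like a polling window. The module itself is not shown in these examples; a minimal sketch of what start() might do under that assumption:

import random
import time

def write_feed(name, data):
    # Stand-in persistence step; the real module's output format is unknown.
    print(name, len(list(data['entries'])), 'entries')

def start(name, build, min_interval, max_interval):
    # Hypothetical sketch: rebuild the feed at a randomized interval.
    while True:
        write_feed(name, build())  # build is e.g. get_feed() above
        time.sleep(random.uniform(min_interval, max_interval))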
Example #2
        q = pyquery.PyQuery(r.text)
        for i in q('.gtr0, .gtr1'):
            link = q('.it5 a', i).attr('href')
            src = q(i).find('.it2 img').attr('src')
            title = q(i).find('.it2 img').attr('alt')
            if not src:
                t = q(i).find('.it2').text().split('~')
                src = 'http://' + t[1] + '/' + t[2]
                title = t[3]
            # NOTE: this hand-built markup is dead code; `content` is
            # immediately overwritten by the template render below.
            img = q('<img>').attr('src', src)
            a = q('<a>').attr('href', link).append(q('<div>').text(title)).append(img.wrap('<div>'))
            content = q('<div>').append(a).html()
            # locals() supplies link/title/src as the Mustache context;
            # `author` is bound earlier in the full script, outside this excerpt.
            content = pystache.render(textwrap.dedent('''\
            <a href="{{link}}">
              <div>{{author}} {{title}}</div>
              <div><img src="{{src}}"></div>
            </a>'''), locals())
            published = q('.itd:first', i).text()
            yield {"link": link, "title": title, "author": author, "content": content, "published": published}
        time.sleep(3)  # throttle between result pages

def get_feed():
    return {
        "link": 'http://g.e-hentai.org/',
        "title": 'e-hentai',
        "entries": get_entries()
    }

if __name__ == '__main__':
    start('e-hentai', get_feed, 7200, 86400)
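
The render call above passes locals() as the Mustache context, so tags like {{link}} and {{title}} resolve to same-named local variables. A minimal self-contained illustration of the same pystache call (the values are made up):

import pystache

link, title = 'http://example.com/1', 'demo entry'  # hypothetical values
# Any mapping works as the context; locals() simply exposes the variables
# bound above under their own names.
html = pystache.render('<a href="{{link}}">{{title}}</a>', locals())
assert html == '<a href="http://example.com/1">demo entry</a>'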
Example #3
def get_entries():
    url = URL
    start = now()
    last = now()
    pages = 0
    while url and diff(start, last) < OLDEST_SEC:
        debug(u"diff {diff}", diff=diff(start, last))
        page = get_page(url)
        pages += 1
        posts = len(page['posts'])
        info(u"page {pages} {url} {posts} posts")
        for post in page['posts']:
            entry = get_entry(post['link'], post['prefix'], post['title'])
            yield entry
            last = min(last, entry['updated'])
            sleep()
        if not page['posts']:
            sleep()
        url = page['next_url']
        sleep()

def get_feed():
    return {
        'link': URL,
        'title': 'Gossiping',
        'entries': sorted(get_entries(), key=lambda e: e['published'], reverse=True)
    }

if __name__ == '__main__':
    start('Gossiping', get_feed, 1800, 3600)
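
This example leans on helpers (URL, OLDEST_SEC, now, diff, sleep, debug, info, get_page, get_entry) defined in the surrounding script and the feed module, none of which are shown. Hypothetical definitions of the timing helpers, consistent with how the loop uses them:

import datetime
import time

def now():
    return datetime.datetime.utcnow()

def diff(start, last):
    # Seconds of history covered so far; grows as `last` walks back in time.
    return (start - last).total_seconds()

def sleep(seconds=3):
    time.sleep(seconds)  # throttle requests to the board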
Example #4
    q('.ad_amazon, .jin-ads, #other_news_website').remove()
    q('#popular_articles_comment, #hot_tweet, #category-link').remove()
    q('.related-articles, #ad2, .ent_ad_md, #ad_rs, #tags').remove()
    q('.tooltip, .comment_form, .article_header').remove()
    # Drop the last two tables of the body and keep only the first 30 comments.
    q(q('.article_bodymore > table')[-2:]).remove()
    q(q('#comment_list li')[30:]).remove()
    # Replace each comment <dl> with the text of its second child, then
    # render the comment list as an ordered list.
    q('#comment_list li dl').replace_with(lambda i, x: x[1].text)
    q('#comment_list ul')[0].tag = 'ol'
    q('img').wrap('<div style="float: left !important">')
    for node in q.root.iter():
        strip(node)
    content = q('.article').html() + q('#comment').html()
    # Collapse runs of consecutive <br/> into one.
    content = re.sub(r'(<br/>)+', '<br/>', content)
    return content

def get_feed():
    f = feedparser.parse('http://jin115.com/index.rdf')
    return {
        'link': f.feed.link,
        'title': f.feed.title,
        'entries': [{
            'link': e.link,
            'title': e.title,
            'author': e.author,
            'content': get_content(e.link),
            'updated': e.updated
        } for e in f.entries]
    }

if __name__ == '__main__':
    start('jin115', get_feed, 1800, 3600)
Example #5
import time

from feed import pq, start  # assumed: pq and start come from the author's feed module


def get_feed():
    return {"link": "http://avno1.playno1.com/", "title": "AV No.1", "entries": get_all_entries()}


def get_all_entries():
    for e in get_entries("http://www.playno1.com/portal.php?mod=list&catid=3"):
        yield e
    for e in get_entries("http://www.playno1.com/portal.php?mod=list&catid=4"):
        yield e


def get_entries(url):
    q = pq(url, redirect=False)
    for e in q(".fire_float"):
        href = q("a", e).attr("href")
        yield get_entry(href)
        time.sleep(3)


def get_entry(href):
    link = "http://www.playno1.com/" + href
    q = pq(link, redirect=False)
    title = q("h1:first").text()
    # The header line looks like "<date> | 作者:<author>"; split on that marker.
    published, author = q("h1:first ~ p").text().split(u" | \u4f5c\u8005:")
    content = q("#article_content").html()
    return {"link": link, "title": title, "author": author, "content": content, "published": published}


if __name__ == "__main__":
    start("avno1", get_feed, 3600, 7200)
Example #6
async def start_listening(app: web.Application, feed: feed.StreamingFeed):
    # The parameter `feed` shadows the imported module inside the body; the
    # annotation is evaluated at definition time, so it still names the module.
    # Keep a handle on the background task so it can be cancelled at cleanup.
    app[STREAMING_FEED] = asyncio.create_task(feed.start())
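
For context, a hook with this signature is typically registered through aiohttp's on_startup list, with a matching cleanup callback to cancel the task. A usage sketch, assuming STREAMING_FEED is a plain string key (the original constant and the StreamingFeed class are not shown):

import asyncio
from functools import partial

from aiohttp import web

STREAMING_FEED = "streaming_feed"  # assumed app key

async def stop_listening(app: web.Application):
    task = app[STREAMING_FEED]
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        pass  # cancellation is the expected shutdown path

def make_app(streaming_feed) -> web.Application:
    app = web.Application()
    # on_startup callbacks receive only `app`, so bind the feed up front.
    app.on_startup.append(partial(start_listening, feed=streaming_feed))
    app.on_cleanup.append(stop_listening)
    return app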
Example #7
import re

import feedparser
import pyquery

from feed import start

def get_author(e):
    # Fall back to 'hackernews' when the summary names no author.
    return (re.findall(r'>(\w+)<', e.summary) or ['hackernews'])[0]

def get_content(e):
    url = re.findall(r'https://news.ycombinator.com/item\?id=\d+', e.summary)[0]
    q = pyquery.PyQuery('<div>')
    q.append(q('<a>').attr('href', url).text(url))
    for c in get_comments(url):
        q.append(q('<p>').html(c))
    return q.html()

def get_feed():
    f = feedparser.parse('http://hnbest.herokuapp.com/rss')
    return {
        'link': f.feed.link,
        'title': f.feed.title,
        'updated': f.feed.updated,
        'entries': [{
            'link': e.link,
            'title': get_title(e),
            'author': get_author(e),
            'content': get_content(e),
            'published': e.published,
            'updated': e.updated
        } for e in f.entries]
    }

if __name__ == '__main__':
    start('hackernews', get_feed, 3600, 7200)
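
get_title() and get_comments() are defined elsewhere in the full script and are not shown here. For illustration only, a hypothetical get_comments() that scrapes comment bodies from the linked thread (Hacker News marks them with the commtext class):

import pyquery

def get_comments(url, limit=10):
    # Hypothetical helper; the original implementation is not shown.
    q = pyquery.PyQuery(url)  # PyQuery fetches the page when handed a URL
    return [q(c).html() for c in q('.commtext')[:limit]]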